Compare commits
22 Commits
6f97554195
...
4969025722
Author | SHA1 | Date | |
---|---|---|---|
4969025722 | |||
91fb433258 | |||
db18d88f35 | |||
e19132c33f | |||
e0c99bc068 | |||
2d4a10da05 | |||
69d6442966 | |||
7d7ced9b81 | |||
9e8de4b4c7 | |||
046835064d | |||
31b81fa33e | |||
c425ebcb00 | |||
9eed49c344 | |||
4123118020 | |||
b99838696e | |||
bbe93d0939 | |||
17acc478b7 | |||
ee6e850174 | |||
f090374caf | |||
ef3e8ef337 | |||
b75d5fc86f | |||
480072779c |
19
README.md
Normal file
19
README.md
Normal file
@ -0,0 +1,19 @@
|
||||
# BNP PDF statement parser
|
||||
|
||||
Depends on `pdftotext`.
|
||||
|
||||
The scripts assume a file hierarchy like the following:
|
||||
|
||||
```
|
||||
.
|
||||
├── compte_de_cheques/
|
||||
│ ├── 2022/
|
||||
│ │ ├── 20221121.pdf
|
||||
│ │ └── 20221221.pdf
|
||||
│ └── 2023/
|
||||
│ ├── 20230123.pdf
|
||||
│ └── 20230221.pdf
|
||||
└── livret_a/
|
||||
├── 20230721.pdf
|
||||
└── 20240122.pdf
|
||||
```
|
@ -1,79 +1,106 @@
|
||||
#!/usr/bin/python3
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Depends on `pdftotext`.
|
||||
import os
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as ticker
|
||||
from datetime import datetime
|
||||
from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement
|
||||
import operator
|
||||
from pprint import pprint
|
||||
|
||||
import os, subprocess, re, config
|
||||
PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'
|
||||
|
||||
path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'
|
||||
os.chdir(PATH)
|
||||
|
||||
os.chdir(path)
|
||||
PRINT_TRANSACTIONS = False
|
||||
|
||||
'''
|
||||
Assuming file hierarchy like:
|
||||
MAIN_BANK_ACCOUNT = 'compte_de_cheques'
|
||||
|
||||
2022
|
||||
├── 20221121.pdf
|
||||
└── 20221221.pdf
|
||||
2023
|
||||
├── 20230123.pdf
|
||||
└── 20230221.pdf
|
||||
'''
|
||||
allTransactions = []
|
||||
monthlyTransactions = {}
|
||||
|
||||
def execute(command):
    """Run `command` (a list of program arguments) and return its standard output as text.

    The output bytes are decoded as UTF-8. `subprocess.check_output` raises
    `subprocess.CalledProcessError` when the command exits with a non-zero status.
    """
    rawOutput = subprocess.check_output(command)
    return rawOutput.decode('utf-8')
|
||||
|
||||
def getTextFromPdf(pdfPath):
    """Return the text of the PDF at `pdfPath` as produced by `pdftotext -raw`.

    The trailing `-` argument makes `pdftotext` write to standard output, which
    `execute` captures and decodes.
    """
    # The argument list was missing its closing bracket, which is a syntax error.
    return execute(['pdftotext', '-raw', pdfPath, '-'])
|
||||
|
||||
firstLineOfPaymentRegex = re.compile('\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
|
||||
endPageAfterTheFirstOneRegex = re.compile('P\. \d+/\d+')
|
||||
soldeCrediteurAuRegex = re.compile('SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')
|
||||
def appendTransactions(transactions, bankAccount):
    """Register `transactions` as belonging to `bankAccount`.

    Each transaction dictionary is tagged in place with a `'bank account'` key,
    appended to the module-level `allTransactions` list and grouped in the
    module-level `monthlyTransactions` dictionary under the first day of the
    month of its `'date'`.
    """
    for transaction in transactions:
        transaction['bank account'] = bankAccount
        allTransactions.append(transaction)
        # Group by month: the key is the transaction date moved to the first of its month.
        month = transaction['date'].replace(day = 1)
        # Append in place instead of rebuilding the month list for every transaction.
        monthlyTransactions.setdefault(month, []).append(transaction)
|
||||
|
||||
for folder in os.listdir():
|
||||
for file in os.listdir(folder):
|
||||
#folder = '2022'
|
||||
#file = '20220321.pdf'
|
||||
if folder != MAIN_BANK_ACCOUNT:
|
||||
for file in os.listdir(folder):
|
||||
filePath = f'{folder}/{file}'
|
||||
print(filePath)
|
||||
transactions = readPdfBankStatement(filePath)[3]
|
||||
pprint(transactions)
|
||||
appendTransactions(transactions, folder)
|
||||
|
||||
os.chdir(f'{MAIN_BANK_ACCOUNT}/')
|
||||
|
||||
for folder in sorted(os.listdir()):
|
||||
for file in sorted(os.listdir(folder)):
|
||||
filePath = f'{folder}/{file}'
|
||||
print(filePath)
|
||||
content = getTextFromPdf(filePath)
|
||||
lines = content.splitlines()
|
||||
started = False
|
||||
firstPage = True
|
||||
initialAmount = None
|
||||
currentAmount = None
|
||||
date = None
|
||||
comment = []
|
||||
for line in lines:
|
||||
if not started:
|
||||
# We are interested in the content after this line:
|
||||
if soldeCrediteurAuRegex.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
|
||||
if soldeCrediteurAuRegex.match(line):
|
||||
initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
|
||||
currentAmount = initialAmount
|
||||
print('Initial amount', initialAmount)
|
||||
print()
|
||||
started = True
|
||||
continue
|
||||
else:
|
||||
# We aren't interested in the content after this line:
|
||||
if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
|
||||
firstPage = False
|
||||
started = False
|
||||
continue
|
||||
# We aren't interested in the content after this line
|
||||
elif line.startswith('TOTAL DES OPERATIONS'):
|
||||
break
|
||||
if firstLineOfPaymentRegex.match(line) is not None:
|
||||
if date is not None:
|
||||
print(date, valeur, amount, currentAmount)
|
||||
print('\n'.join(comment))
|
||||
print()
|
||||
date, valeur, amount = line.split()
|
||||
amount = float(amount.replace(',', '.'))
|
||||
currentAmount -= amount
|
||||
comment = []
|
||||
else:
|
||||
comment += [line]
|
||||
break
|
||||
break
|
||||
transactions = readPdfBankStatement(filePath)[3]
|
||||
appendTransactions(transactions, MAIN_BANK_ACCOUNT)
|
||||
if PRINT_TRANSACTIONS:
|
||||
pprint(transactions)
|
||||
#break
|
||||
#break
|
||||
|
||||
allTransactions.sort(key = operator.itemgetter('date'))
|
||||
print(len(allTransactions))
|
||||
#pprint(allTransactions)
|
||||
|
||||
import re
|
||||
# First two comment lines of a transfer emitted to another account: a fixed label,
# then an account-type code followed by 23 digits.
# A raw string avoids the invalid `\.` escape of a regular string literal.
VIRT_A_CPTE_EMIS_SUR_LE_REGEX = re.compile(r'VIRT CPTE A CPTE EMIS SUR LE\n(CEL|LEP|LVJ|L\.A|LDD)\d{23}')

# The bank account could be passed as well, to restrict which comments count as own-account transfers.
def isTransactionFromOwnAccounts(comment):
    """Return `True` when `comment` is recognised as a transfer between own accounts.

    A comment is recognised when it starts with one of two fixed labels or
    matches `VIRT_A_CPTE_EMIS_SUR_LE_REGEX` from its beginning. The
    received-transfer prefix `VIR CPTE A CPTE RECU /DE ` is currently not checked.

    Always returns a `bool` (never a match object or `None`).
    """
    ownAccountPrefixes = (
        'DEPOT INITIAL DU COMPTE\n',
        'VIR CPTE A CPTE EMIS /MOTIF ',
    )
    if comment.startswith(ownAccountPrefixes):
        return True
    return VIRT_A_CPTE_EMIS_SUR_LE_REGEX.match(comment) is not None
|
||||
|
||||
#allTransactions = [transaction for transaction in allTransactions if not isTransactionFromOwnAccounts(transaction['comment'])]
|
||||
# Months (keys of `monthlyTransactions`) in chronological order.
sortedMonths = sorted(monthlyTransactions)

# Drop the transactions that `isTransactionFromOwnAccounts` flags.
for month in sortedMonths:
    monthlyTransactions[month] = [
        transaction
        for transaction in monthlyTransactions[month]
        if not isTransactionFromOwnAccounts(transaction['comment'])
    ]

# One value per entry of `sortedMonths`.
totalMonthlyDebits = [sum(min(transaction['amount'], 0) for transaction in monthlyTransactions[month]) for month in sortedMonths]
totalMonthlyCredits = [sum(max(transaction['amount'], 0) for transaction in monthlyTransactions[month]) for month in sortedMonths]
totalMonthlyDifferences = [sum(transaction['amount'] for transaction in monthlyTransactions[month]) for month in sortedMonths]
# Running total: `current amount` of the first kept transaction of the first month plus the cumulated differences.
# NOTE(review): raises `IndexError` when there is no month, or when the first month has no transaction left after filtering.
firstCurrentAmount = monthlyTransactions[sortedMonths[0]][0]['current amount']
totals = [firstCurrentAmount + sum(totalMonthlyDifferences[:monthIndex + 1]) for monthIndex in range(len(sortedMonths))]

fig, ax = plt.subplots()
plt.title('BNP accounts monthly debits and credits')
plt.xlabel('Date')
plt.ylabel('€')
ALPHA = 0.5

# Every month between the first and the last one gets a tick, even without any transaction.
xTicks = range(getMonthIndexSinceEpoch(sortedMonths[0]), getMonthIndexSinceEpoch(sortedMonths[-1]) + 1)
# The data lists only cover the months present in `sortedMonths`, so each bar is
# placed at its own month index; plotting against `xTicks` fails with a shape
# mismatch as soon as one month in the range has no transaction.
barPositions = [getMonthIndexSinceEpoch(month) for month in sortedMonths]
totalMonthlyAmountAndLabel = (
    (totalMonthlyDebits, 'Debit'),
    (totalMonthlyCredits, 'Credit'),
    (totalMonthlyDifferences, 'Difference'),
    (totals, 'Total'),
)
for totalMonthlyAmount, totalMonthlyLabel in totalMonthlyAmountAndLabel:
    plt.bar(barPositions, totalMonthlyAmount, alpha = ALPHA, label = totalMonthlyLabel)
plt.legend()

#plt.yscale('symlog')
# Thousands separators on the y axis.
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,}'))

ticksLabels = [getMonthNameFromMonthIndex(monthIndex) for monthIndex in xTicks]
plt.xticks(xTicks, ticksLabels, rotation = 90)
#plt.tight_layout()
# How to show the horizontal lines for subticks?
plt.grid(axis = 'y')

plt.show()
|
119
utils.py
Normal file
119
utils.py
Normal file
@ -0,0 +1,119 @@
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
# For the non-greedy `?` quantifier, see [Stack Overflow answer 766377](https://stackoverflow.com/a/766377)
|
||||
FIRST_LINE_OF_PAYMENT_REGEX = re.compile('\\ +(\\d{2}\\.\\d{2})\\ +([A-Z\\d /.()*\\-,]+?)\\ +(\\d{2}\\.\\d{2})\\ +([\\d ]+,\\d{2})')
|
||||
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(' +RELEVE ((DE (COMPTE (CHEQUES|D\'EPARGNE LOGEMENT|LEP))|LIVRET (A|JEUNE))|LIVRET DEV. DURABLE ET SOLIDAIRE) +P\\. \\d+/\\d+')
|
||||
SOLDE_CREDITEUR_AU_REGEX = re.compile('\\ +SOLDE CREDITEUR AU (\\d{2}\\.\\d{2}\\.\\d{4})\\ +([\\d ]+,\\d{2})')
|
||||
TOTAL_DES_OPERATIONS_REGEX = re.compile('\\ +TOTAL\\ DES\\ OPERATIONS\\ +([\\d ]+,\\d{2})\\ +([\\d ]+,\\d{2})')
|
||||
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile('\\ +TOTAL\\ DES\\ OPERATIONS\\ +([\\d ]+,\\d{2})')
|
||||
COLUMNS_HEADER = re.compile(' +Date +Nature des opérations +Valeur +Débit +Crédit')
|
||||
|
||||
def execute(command):
    """Run `command` (a list of program arguments) and return what it wrote to standard output.

    The captured bytes are decoded as UTF-8. A non-zero exit status makes
    `subprocess.check_output` raise `subprocess.CalledProcessError`.
    """
    outputBytes = subprocess.check_output(command)
    outputText = outputBytes.decode('utf-8')
    return outputText
|
||||
|
||||
def getTextFromPdf(pdfPath):
    """Return the text of the PDF at `pdfPath` as produced by `pdftotext -layout`.

    The final `-` argument asks `pdftotext` to write to standard output, which
    `execute` captures and decodes.
    """
    pdfToTextCommand = ['pdftotext', '-layout', pdfPath, '-']
    return execute(pdfToTextCommand)
|
||||
|
||||
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of `aDatetimeStr`.

    The first four characters are read as the year and the next two as the
    month (e.g. `20221121.pdf`); anything after them is ignored.
    """
    year = int(aDatetimeStr[:4])
    month = int(aDatetimeStr[4:6])
    return datetime(year = year, month = month, day = 1)
|
||||
|
||||
def getMonthIndexSinceEpoch(aDatetime):
    """Return a month counter for `aDatetime`: twelve times its year plus its month number.

    Consecutive months get consecutive integers, so the value can be used as
    an x coordinate.
    """
    monthsInYear = 12
    return monthsInYear * aDatetime.year + aDatetime.month
|
||||
|
||||
def getMonthNameFromMonthIndex(monthIndex):
    """Return the label (`'%b %Y'`, e.g. `Jan 2023`) of the month numbered `monthIndex`.

    `monthIndex` is twelve times the year plus the month number (1 to 12).
    """
    # Shift by one so that December stays in its own year instead of rolling over.
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    firstOfMonth = datetime(year, zeroBasedMonth + 1, 1)
    return firstOfMonth.strftime('%b %Y')
|
||||
|
||||
def toFloat(group):
    """Convert an amount written with a decimal comma and space separators to a `float`.

    For instance `'1 234,56'` gives `1234.56`.
    """
    # One pass: the comma becomes the decimal point and the spaces are dropped.
    normalized = group.translate(str.maketrans({',': '.', ' ': None}))
    return float(normalized)
|
||||
|
||||
def getDateFollowing(date, initialDate):
    """Return the first `datetime` on or after `initialDate` that falls on `date`.

    `date` is a `DD.MM` string without a year. The year of `initialDate` is
    tried first; when the resulting day would precede `initialDate`, the
    following year is used, which supports statements spanning a new year.

    The day and month are parsed by hand rather than with
    `datetime.strptime(date, '%d.%m')`: without a year `strptime` assumes
    1900, which is not a leap year, so `29.02` was always rejected.

    Raises `ValueError` when `date` is malformed or does not exist in either
    candidate year.
    """
    day, month = (int(part) for part in date.split('.'))
    for year in (initialDate.year, initialDate.year + 1):
        try:
            candidate = datetime(year, month, day)
        except ValueError:
            # The day does not exist in this year (e.g. 29.02 in a non-leap year).
            continue
        if candidate >= initialDate:
            return candidate
    raise ValueError(f'No valid date {date!r} following {initialDate}')
|
||||
|
||||
def _buildTransaction(date, valeur, amount, currentAmount, commentLines, initialDate):
    # Assemble the dictionary describing one parsed transaction.
    return {
        'date': getDateFollowing(date, initialDate),
        'valeur': getDateFollowing(valeur, initialDate),
        'amount': amount,
        'current amount': currentAmount,
        'comment': '\n'.join(commentLines)
    }


def readPdfBankStatement(filePath):
    """Parse the PDF bank statement at `filePath` from its `pdftotext -layout` text.

    Parsing starts at the `SOLDE CREDITEUR AU` line (opening date and
    balance), pauses at each page footer, resumes at the columns header of
    the following pages and stops at the `TOTAL DES OPERATIONS` line. Lines
    outside those regions are ignored. Inside them, a line matching
    `FIRST_LINE_OF_PAYMENT_REGEX` opens a transaction and every other
    non-empty line is appended to the comment of the current transaction.

    Returns a tuple `(initialAmount, totalMonthlyDebit, totalMonthlyCredit,
    transactions, fileDatetime)`:

    - `initialAmount`: opening balance, or `None` when no such line is found;
    - `totalMonthlyDebit`, `totalMonthlyCredit`: totals read from the
      `TOTAL DES OPERATIONS` line, or `None` when the statement has none
      (previously the function failed with `UnboundLocalError`);
    - `transactions`: list of dictionaries with the keys `'date'`, `'valeur'`,
      `'amount'` (negative for debits), `'current amount'` (balance after the
      transaction) and `'comment'`;
    - `fileDatetime`: first day of the month encoded in the file name.

    Raises `ValueError` when a transaction line shows up before the columns
    header or the opening balance has been read.
    """
    fileDatetime = getDatetimeFromFileName(filePath.split('/')[-1])
    lines = getTextFromPdf(filePath).splitlines()

    started = False
    firstPage = True
    initialAmount = None
    initialDate = None
    currentAmount = None
    # Defined up front so that the `return` below cannot hit unbound names.
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    date = None
    valeur = None
    amount = None
    comment = []
    transactions = []
    debitIndex = None
    creditIndex = None

    for line in lines:
        if not started:
            # We are interested in the content after the opening balance line
            # or, on the following pages, after the columns header.
            soldeCrediteurAuRegexMatch = SOLDE_CREDITEUR_AU_REGEX.fullmatch(line)
            isColumnsHeader = COLUMNS_HEADER.fullmatch(line) is not None
            if isColumnsHeader:
                # Remember the column at which each header word ends.
                debitIndex = line.index('Débit') + len('Débit')
                creditIndex = line.index('Crédit') + len('Crédit')
            if soldeCrediteurAuRegexMatch is not None:
                initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y')
                initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2))
                print(f'{initialAmount=}')
                currentAmount = initialAmount
                started = True
            elif isColumnsHeader and not firstPage:
                started = True
            continue

        # We aren't interested in the content between the footer of a page and
        # the columns header of the next one.
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.fullmatch(line) is not None:
            firstPage = False
            started = False
            continue

        # We aren't interested in the content after the totals line.
        totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.fullmatch(line)
        totalDesOperationsCreditOnlyRegexMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.fullmatch(line)
        if totalDesOperationsRegexMatch is not None or totalDesOperationsCreditOnlyRegexMatch is not None:
            # Note that transfers between accounts are counted in both debits and credits:
            # trying to cancel them out would make benefits show up as a negative debit,
            # which does not make sense.
            # January cannot simply be considered as benefits only, as `20240122.pdf`
            # also contains an additional transfer between own accounts.
            if totalDesOperationsRegexMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
            else:
                # A single amount on the totals line is read as the credit total.
                totalMonthlyCredit = toFloat(totalDesOperationsCreditOnlyRegexMatch.group(1))
                totalMonthlyDebit = 0
            print(f'{totalMonthlyDebit=}')
            print(f'{totalMonthlyCredit=}')
            break

        firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.fullmatch(line)
        if firstLineOfPaymentRegexMatch is not None:
            if debitIndex is None or creditIndex is None or currentAmount is None:
                raise ValueError(f'{filePath}: transaction found before the columns header or the opening balance')
            # A new transaction starts: the previous one, if any, is now complete.
            if date is not None:
                transactions.append(_buildTransaction(date, valeur, amount, currentAmount, comment, initialDate))
            date, firstCommentLine, valeur, amount = firstLineOfPaymentRegexMatch.groups()
            amount = toFloat(amount)
            # The line ends with the amount: whichever of the two column ends is closer
            # to the end of the line tells whether it is a debit or a credit.
            lineLen = len(line)
            if abs(debitIndex - lineLen) < abs(creditIndex - lineLen):
                amount *= -1
            currentAmount += amount
            comment = [firstCommentLine]
        elif line != '':
            comment.append(line.strip())

    # The last transaction has no following transaction line to flush it.
    if date is not None:
        transactions.append(_buildTransaction(date, valeur, amount, currentAmount, comment, initialDate))
    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime
|
Loading…
Reference in New Issue
Block a user