Compare commits

...

22 Commits

Author SHA1 Message Date
4969025722
Replace re.match with re.fullmatch
See [Benjamin-Loison/cpython/issues/43](https://github.com/Benjamin-Loison/cpython/issues/43).
2024-10-16 20:03:32 +02:00
91fb433258
Add and use isTransactionFromOwnAccounts 2024-10-04 12:47:48 +02:00
db18d88f35
Correctly make all transactions match current balance 2024-10-04 02:04:55 +02:00
e19132c33f
Fix #2: Use amount sign to precise if it's a debit or a credit 2024-10-04 01:37:47 +02:00
e0c99bc068
Parse all relevés 2024-10-04 01:15:38 +02:00
2d4a10da05
#2: Use pdftotext -layout instead of pdftotext -raw to distinguish debit from credit 2024-10-04 00:53:33 +02:00
69d6442966
WIP about to plot with transaction precision 2024-10-03 23:47:26 +02:00
7d7ced9b81
WIP sort all transactions 2024-10-03 21:36:43 +02:00
9e8de4b4c7
WIP gather all transactions over bank accounts 2024-10-03 21:33:38 +02:00
046835064d
WIP transaction precise 2024-10-03 21:22:30 +02:00
31b81fa33e
Make date and valeur of transaction be datetimes 2024-10-03 21:20:07 +02:00
c425ebcb00
Correctly detect transactions with thousands 2024-10-03 20:36:39 +02:00
9eed49c344
Consider last transaction 2024-10-03 20:10:06 +02:00
4123118020
Add and use readPdfBankStatement 2024-10-03 19:51:54 +02:00
b99838696e
WIP adding other bank accounts 2024-10-03 19:33:54 +02:00
bbe93d0939
Move stuff from main.py to README.md and utils.py 2024-10-03 18:07:03 +02:00
17acc478b7
Uppercase constants 2024-10-03 17:43:44 +02:00
ee6e850174
Specify compte_de_cheques/ 2024-10-03 17:36:07 +02:00
f090374caf
Use symlog scale instead of log 2024-10-02 00:33:11 +02:00
ef3e8ef337
Add logarithmic scale 2024-10-01 20:57:36 +02:00
b75d5fc86f
Add WIP monthly debit and credit plot 2024-10-01 20:35:43 +02:00
480072779c
Remove config dependency 2023-11-05 21:10:45 +01:00
3 changed files with 232 additions and 67 deletions

19
README.md Normal file
View File

@ -0,0 +1,19 @@
# BNP PDF statement parser
Depends on `pdftotext`.
Assuming file hierarchy like:
```
.
├── compte_de_cheques/
│   ├── 2022/
│   │   ├── 20221121.pdf
│   │   └── 20221221.pdf
│   └── 2023/
│       ├── 20230123.pdf
│       └── 20230221.pdf
└── livret_a/
    ├── 20230721.pdf
    └── 20240122.pdf
```

View File

@ -1,79 +1,106 @@
#!/usr/bin/python3
#!/usr/bin/env python
# Depends on `pdftotext`.
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from datetime import datetime
from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement
import operator
from pprint import pprint
import os, subprocess, re, config
PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'
path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'
os.chdir(PATH)
os.chdir(path)
PRINT_TRANSACTIONS = False
'''
Assuming file hierarchy like:
MAIN_BANK_ACCOUNT = 'compte_de_cheques'
2022
20221121.pdf
20221221.pdf
2023
20230123.pdf
20230221.pdf
'''
allTransactions = []
monthlyTransactions = {}
def execute(command):
    """Run `command` and return what it wrote to standard output, decoded as UTF-8.

    `command` is handed unchanged to `subprocess.check_output`.
    """
    rawOutput = subprocess.check_output(command)
    return rawOutput.decode('utf-8')
def getTextFromPdf(pdfPath):
    """Return the text that `pdftotext -raw` extracts from the PDF at `pdfPath`.

    The final `-` argument asks `pdftotext` to write the text to standard output,
    which `execute` captures and decodes.
    """
    return execute(['pdftotext', '-raw', pdfPath, '-'])
# Raw strings keep `\d` and `\.` as regex escapes instead of (invalid) string escapes.
# First line of a transaction: `<date> <value date> <amount>`, e.g. `21.09 21.09 12,34`.
firstLineOfPaymentRegex = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
# Page counter such as `P. 2/3`.
endPageAfterTheFirstOneRegex = re.compile(r'P\. \d+/\d+')
# Prefix of the opening balance line, e.g. `SOLDE CREDITEUR AU 21.09.2024`.
soldeCrediteurAuRegex = re.compile(r'SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')
def appendTransactions(transactions, bankAccount):
    """Register `transactions` as belonging to `bankAccount`.

    Each transaction dict is tagged in place with a 'bank account' key, appended to
    the module-level `allTransactions` list and filed in the module-level
    `monthlyTransactions` dict under the first day of the month of its 'date'.
    """
    for transaction in transactions:
        transaction['bank account'] = bankAccount
        allTransactions.append(transaction)
        # Months are keyed by their first day so that all transactions of a month share a key.
        month = transaction['date'].replace(day = 1)
        # Appending in place avoids re-copying the whole month list for every transaction.
        monthlyTransactions.setdefault(month, []).append(transaction)
for folder in os.listdir():
if folder != MAIN_BANK_ACCOUNT:
for file in os.listdir(folder):
#folder = '2022'
#file = '20220321.pdf'
filePath = f'{folder}/{file}'
print(filePath)
content = getTextFromPdf(filePath)
lines = content.splitlines()
started = False
firstPage = True
initialAmount = None
currentAmount = None
date = None
comment = []
for line in lines:
if not started:
# We are interested in the content after this line:
if soldeCrediteurAuRegex.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
if soldeCrediteurAuRegex.match(line):
initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
currentAmount = initialAmount
print('Initial amount', initialAmount)
print()
started = True
continue
else:
# We aren't interested in the content after this line:
if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
firstPage = False
started = False
continue
# We aren't interested in the content after this line
elif line.startswith('TOTAL DES OPERATIONS'):
break
if firstLineOfPaymentRegex.match(line) is not None:
if date is not None:
print(date, valeur, amount, currentAmount)
print('\n'.join(comment))
print()
date, valeur, amount = line.split()
amount = float(amount.replace(',', '.'))
currentAmount -= amount
comment = []
else:
comment += [line]
break
break
transactions = readPdfBankStatement(filePath)[3]
pprint(transactions)
appendTransactions(transactions, folder)
# Statements of the main account sit one directory level deeper than the others:
# every entry of the account folder is itself a folder of statements.
os.chdir(f'{MAIN_BANK_ACCOUNT}/')
for folder in sorted(os.listdir()):
    for file in sorted(os.listdir(folder)):
        filePath = f'{folder}/{file}'
        print(filePath)
        # Only the transactions (fourth item of the returned tuple) are needed here.
        statement = readPdfBankStatement(filePath)
        transactions = statement[3]
        appendTransactions(transactions, MAIN_BANK_ACCOUNT)
        if PRINT_TRANSACTIONS:
            pprint(transactions)

# Put the transactions of all accounts in chronological order.
allTransactions.sort(key = lambda transaction: transaction['date'])
print(len(allTransactions))
import re

# Emitted account-to-account transfer whose second comment line is one of the listed
# account codes followed by 23 digits (presumably the savings account kind and its
# identifier - to be confirmed against real statements).
# A raw string keeps `\.` and `\d` as regex escapes; `\n` still matches a newline.
VIRT_A_CPTE_EMIS_SUR_LE_REGEX = re.compile(r'VIRT CPTE A CPTE EMIS SUR LE\n(CEL|LEP|LVJ|L\.A|LDD)\d{23}')

# The bank account could be taken into account to restrict which comments count as own-account ones.
def isTransactionFromOwnAccounts(comment):
    """Return whether `comment` describes a transfer between the owner's own accounts.

    `comment` is the multi-line comment of a transaction. It is recognized when it
    starts with the initial deposit marker, with an emitted account-to-account
    transfer carrying a `/MOTIF`, or when it matches `VIRT_A_CPTE_EMIS_SUR_LE_REGEX`.
    Always returns a real `bool`.
    """
    # Received transfers ('VIR CPTE A CPTE RECU /DE ') are not matched here.
    ownAccountsPrefixes = (
        'DEPOT INITIAL DU COMPTE\n',
        'VIR CPTE A CPTE EMIS /MOTIF ',
    )
    if comment.startswith(ownAccountsPrefixes):
        return True
    return VIRT_A_CPTE_EMIS_SUR_LE_REGEX.match(comment) is not None
#allTransactions = [transaction for transaction in allTransactions if not isTransactionFromOwnAccounts(transaction['comment'])]
sortedMonths = sorted(monthlyTransactions)
# Drop the transactions recognized as transfers between own accounts before aggregating.
for month in sortedMonths:
    monthlyTransactions[month] = [
        transaction
        for transaction in monthlyTransactions[month]
        if not isTransactionFromOwnAccounts(transaction['comment'])
    ]
# Debits are the negative amounts, credits the positive ones; each list has one entry per month.
totalMonthlyDebits = [
    sum(min(transaction['amount'], 0) for transaction in monthlyTransactions[month])
    for month in sortedMonths
]
totalMonthlyCredits = [
    sum(max(transaction['amount'], 0) for transaction in monthlyTransactions[month])
    for month in sortedMonths
]
totalMonthlyDifferences = [
    sum(transaction['amount'] for transaction in monthlyTransactions[month])
    for month in sortedMonths
]
# Running total: 'current amount' of the very first kept transaction plus the
# differences of all months up to and including the current one.
totals = [
    monthlyTransactions[sortedMonths[0]][0]['current amount'] + sum(totalMonthlyDifferences[:monthCount])
    for monthCount in range(1, len(sortedMonths) + 1)
]
fig, ax = plt.subplots()
ax.set_title('BNP accounts monthly debits and credits')
ax.set_xlabel('Date')
ax.set_ylabel('')

ALPHA = 0.5

# One tick per month between the first and the last month having transactions.
# NOTE(review): the bar heights hold one value per month present in `sortedMonths`;
# this presumably relies on no month being absent in between - to be confirmed.
xTicks = range(getMonthIndexSinceEpoch(sortedMonths[0]), getMonthIndexSinceEpoch(sortedMonths[-1]) + 1)

totalMonthlyAmountAndLabel = (
    (totalMonthlyDebits, 'Debit'),
    (totalMonthlyCredits, 'Credit'),
    (totalMonthlyDifferences, 'Difference'),
    (totals, 'Total'),
)

# The series are drawn on top of each other, hence the transparency.
for totalMonthlyAmount, totalMonthlyLabel in totalMonthlyAmountAndLabel:
    ax.bar(xTicks, totalMonthlyAmount, alpha = ALPHA, label = totalMonthlyLabel)

ax.legend()
# A symmetric logarithmic scale (`plt.yscale('symlog')`) is left disabled.
# Show amounts with a thousands separator.
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,}'))

ticksLabels = list(map(getMonthNameFromMonthIndex, xTicks))
plt.xticks(xTicks, ticksLabels, rotation = 90)
# `plt.tight_layout()` is left disabled.
# Open question: how to also show horizontal lines for the minor ticks?
ax.grid(axis = 'y')
plt.show()

119
utils.py Normal file
View File

@ -0,0 +1,119 @@
import subprocess
from datetime import datetime
import re
# All patterns target single lines of `pdftotext -layout` output and are meant to be used with `fullmatch`.
# First line of a transaction: date, first comment line, value date and amount.
# For the non-greedy `+?` quantifier, see Source: [the Stack Overflow answer 766377](https://stackoverflow.com/a/766377)
FIRST_LINE_OF_PAYMENT_REGEX = re.compile(r'\ +(\d{2}\.\d{2})\ +([A-Z\d /.()*\-,]+?)\ +(\d{2}\.\d{2})\ +([\d ]+,\d{2})')
# `RELEVE <account kind> P. <page>/<pages>` line; the dot of `DEV.` is escaped so that it only matches a literal dot.
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(r" +RELEVE ((DE (COMPTE (CHEQUES|D'EPARGNE LOGEMENT|LEP))|LIVRET (A|JEUNE))|LIVRET DEV\. DURABLE ET SOLIDAIRE) +P\. \d+/\d+")
# Opening balance line: captures the balance date and amount.
SOLDE_CREDITEUR_AU_REGEX = re.compile(r'\ +SOLDE CREDITEUR AU (\d{2}\.\d{2}\.\d{4})\ +([\d ]+,\d{2})')
# Totals line holding two amounts (debit then credit).
TOTAL_DES_OPERATIONS_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})\ +([\d ]+,\d{2})')
# Totals line holding a single amount.
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})')
# Header of the transactions table.
COLUMNS_HEADER = re.compile(' +Date +Nature des opérations +Valeur +Débit +Crédit')
def execute(command):
    """Run `command` (passed as is to `subprocess.check_output`) and return its standard output as a UTF-8 decoded string."""
    outputBytes = subprocess.check_output(command)
    return str(outputBytes, 'utf-8')
def getTextFromPdf(pdfPath):
    """Return the text that `pdftotext -layout` extracts from the PDF at `pdfPath`.

    The final `-` argument asks `pdftotext` to write the text to standard output,
    which `execute` captures and decodes.
    """
    command = ['pdftotext', '-layout', pdfPath, '-']
    return execute(command)
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of `aDatetimeStr`.

    The string must start with `YYYYMM` (e.g. `20221121.pdf`); everything after the
    sixth character, including the day, is ignored.
    """
    year, month = int(aDatetimeStr[:4]), int(aDatetimeStr[4:6])
    return datetime(year, month, 1)
def getMonthIndexSinceEpoch(aDatetime):
    """Return `year * 12 + month` for `aDatetime`, a counter growing by one every calendar month."""
    monthsOfFullYears = 12 * aDatetime.year
    return monthsOfFullYears + aDatetime.month
def getMonthNameFromMonthIndex(monthIndex):
    """Return a label such as `Oct 2024` for a `year * 12 + month` index (inverse of `getMonthIndexSinceEpoch`)."""
    # Months are 1-based, so shift by one before splitting into year and zero-based month.
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    firstDayOfMonth = datetime(year, zeroBasedMonth + 1, 1)
    return firstDayOfMonth.strftime('%b %Y')
def toFloat(group):
    """Convert an amount written like `1 234,56` (comma as decimal separator, spaces between thousands) to a float."""
    # A single pass both turns the decimal comma into a dot and drops the spaces.
    normalized = group.translate({ord(','): '.', ord(' '): None})
    return float(normalized)
def getDateFollowing(date, initialDate):
    """Return the first datetime on or after `initialDate` falling on `date`.

    `date` is a `DD.MM` string without year and `initialDate` a `datetime`. The year
    of `initialDate` is tried first, then the following one to support statements
    spanning the new year.

    Raises `ValueError` if `date` is valid in neither of these two years.
    """
    # The year is parsed together with the day and month: without it `strptime`
    # assumes 1900, which is not a leap year, and rejects `29.02`.
    for year in (initialDate.year, initialDate.year + 1):
        try:
            candidate = datetime.strptime(f'{date}.{year:04d}', '%d.%m.%Y')
        except ValueError:
            # Not a valid date that year (e.g. `29.02` outside a leap year).
            continue
        if candidate >= initialDate:
            return candidate
    raise ValueError(f'{date!r} is not a valid DD.MM date on or after {initialDate}')
def _getColumnEndIndex(headerLine, columnTitle):
    """Return the index just past `columnTitle` within `headerLine`."""
    return headerLine.index(columnTitle) + len(columnTitle)


def _makeTransaction(date, valeur, amount, currentAmount, comment, initialDate):
    """Build the dict describing one transaction from its raw parsed parts."""
    return {
        'date': getDateFollowing(date, initialDate),
        'valeur': getDateFollowing(valeur, initialDate),
        'amount': amount,
        'current amount': currentAmount,
        'comment': '\n'.join(comment)
    }


def readPdfBankStatement(filePath):
    """Parse the PDF bank statement at `filePath`.

    The file name must start with `YYYYMM` (see `getDatetimeFromFileName`).

    Returns the tuple `(initialAmount, totalMonthlyDebit, totalMonthlyCredit,
    transactions, fileDatetime)` where:
    - `initialAmount` is the amount of the `SOLDE CREDITEUR AU` line (`None` if absent),
    - `totalMonthlyDebit` and `totalMonthlyCredit` come from the `TOTAL DES OPERATIONS`
      line (`None` if the statement has no such line),
    - `transactions` is a list of dicts with keys 'date', 'valeur', 'amount' (negative
      for debits), 'current amount' (balance after the transaction) and 'comment',
    - `fileDatetime` is the first day of the month encoded in the file name.
    """
    file = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(file)
    lines = getTextFromPdf(filePath).splitlines()
    # Whether the current line is inside the transactions table.
    started = False
    firstPage = True
    initialAmount = None
    initialDate = None
    currentAmount = None
    # Defined upfront so that a statement without totals line does not fail at `return`.
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    # Raw parts of the transaction being read; `date` is `None` until the first one is met.
    date = None
    valeur = None
    amount = None
    comment = []
    transactions = []
    debitIndex = None
    creditIndex = None
    for line in lines:
        if not started:
            # Outside the table: only look for what (re)starts it.
            isColumnsHeader = COLUMNS_HEADER.fullmatch(line) is not None
            if isColumnsHeader:
                # Remember where both amount columns end to later tell debits from credits.
                debitIndex = _getColumnEndIndex(line, 'Débit')
                creditIndex = _getColumnEndIndex(line, 'Crédit')
            soldeCrediteurAuRegexMatch = SOLDE_CREDITEUR_AU_REGEX.fullmatch(line)
            if soldeCrediteurAuRegexMatch is not None:
                # On the first page the table content starts after the initial balance line.
                initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y')
                initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2))
                print(f'{initialAmount=}')
                currentAmount = initialAmount
                started = True
            elif isColumnsHeader and not firstPage:
                # On the following pages it starts right after the columns header.
                started = True
            continue
        # We aren't interested in the content after this line, until the table restarts:
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.fullmatch(line) is not None:
            firstPage = False
            started = False
            continue
        totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.fullmatch(line)
        totalDesOperationsCreditOnlyRegexMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.fullmatch(line)
        if totalDesOperationsRegexMatch is not None or totalDesOperationsCreditOnlyRegexMatch is not None:
            # Note that transfers between accounts are counted in both debits and credits, as trying to cancel them would make benefits show as negative debit, which does not make sense.
            # Cannot just consider January as benefits only, as `20240122.pdf` also contains an additional transfer between own accounts.
            if totalDesOperationsRegexMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
            else:
                # A single total is taken to be the credit one.
                totalMonthlyCredit = toFloat(totalDesOperationsCreditOnlyRegexMatch.group(1))
                totalMonthlyDebit = 0
            print(f'{totalMonthlyDebit=}')
            print(f'{totalMonthlyCredit=}')
            # Nothing of interest follows the totals line.
            break
        firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.fullmatch(line)
        if firstLineOfPaymentRegexMatch is not None:
            # A new transaction begins: the previous one, if any, is now complete.
            if date is not None:
                transactions.append(_makeTransaction(date, valeur, amount, currentAmount, comment, initialDate))
            date, firstCommentLine, valeur, amount = firstLineOfPaymentRegexMatch.groups()
            amount = toFloat(amount)
            # The line ends with the amount, so its length is closest to the end of
            # the column the amount belongs to.
            lineLen = len(line)
            if abs(debitIndex - lineLen) < abs(creditIndex - lineLen):
                amount *= -1
            currentAmount += amount
            comment = [firstCommentLine]
        elif line != '':
            # Any other non-empty line continues the comment of the current transaction.
            comment.append(line.strip())
    # Consider the last transaction, which no following transaction line completed.
    if date is not None:
        transactions.append(_makeTransaction(date, valeur, amount, currentAmount, comment, initialDate))
    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime