Compare commits

..

No commits in common. "49690257222a19668ae50f701f038be945ba8d69" and "6f97554195b4f9de4d28782a706e5389ba756531" have entirely different histories.

3 changed files with 67 additions and 232 deletions

View File

@ -1,19 +0,0 @@
# BNP PDF statement parser
Depends on the `pdftotext` command-line tool.
Assumes a file hierarchy like:
```
.
├── compte_de_cheques/
│   ├── 2022/
│   │   ├── 20221121.pdf
│   │   └── 20221221.pdf
│   └── 2023/
│       ├── 20230123.pdf
│       └── 20230221.pdf
└── livret_a/
    ├── 20230721.pdf
    └── 20240122.pdf
```

View File

@ -1,106 +1,79 @@
#!/usr/bin/env python
#!/usr/bin/python3
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from datetime import datetime
from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement
import operator
from pprint import pprint
# Depends on `pdftotext`.
PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'
import os, subprocess, re, config
os.chdir(PATH)
path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'
PRINT_TRANSACTIONS = False
os.chdir(path)
MAIN_BANK_ACCOUNT = 'compte_de_cheques'
'''
Assuming file hierarchy like:
allTransactions = []
monthlyTransactions = {}
2022
20221121.pdf
20221221.pdf
2023
20230123.pdf
20230221.pdf
'''
def appendTransactions(transactions, bankAccount):
    """Register `transactions` in the module-level collections.

    Each transaction dict is tagged in place with the key 'bank account'
    (set to `bankAccount`), appended to `allTransactions`, and appended to
    the `monthlyTransactions` bucket keyed by the transaction date moved to
    the first day of its month.

    Args:
        transactions: iterable of transaction dicts; each must hold a 'date'
            value supporting `.replace(day = 1)`.
        bankAccount: label stored under 'bank account' on every transaction.
    """
    for transaction in transactions:
        transaction['bank account'] = bankAccount
        # Mutating the list in place needs no `global` declaration.
        allTransactions.append(transaction)
        month = transaction['date'].replace(day = 1)
        # Append to the existing bucket instead of copying it on every insert.
        monthlyTransactions.setdefault(month, []).append(transaction)
def execute(command):
    """Run `command` (an argument list) and return its stdout as text.

    The captured bytes are decoded as UTF-8.
    """
    rawOutput = subprocess.check_output(command)
    return str(rawOutput, 'utf-8')
def getTextFromPdf(pdfPath):
    """Return the text of the PDF at `pdfPath` as produced by `pdftotext -raw`.

    The trailing '-' argument asks `pdftotext` to write to standard output,
    which `execute` captures and returns.
    """
    # The original list literal was closed with ')' only, a SyntaxError.
    return execute(['pdftotext', '-raw', pdfPath, '-'])
firstLineOfPaymentRegex = re.compile('\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
endPageAfterTheFirstOneRegex = re.compile('P\. \d+/\d+')
soldeCrediteurAuRegex = re.compile('SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')
for folder in os.listdir():
if folder != MAIN_BANK_ACCOUNT:
for file in os.listdir(folder):
#folder = '2022'
#file = '20220321.pdf'
filePath = f'{folder}/{file}'
print(filePath)
transactions = readPdfBankStatement(filePath)[3]
pprint(transactions)
appendTransactions(transactions, folder)
content = getTextFromPdf(filePath)
lines = content.splitlines()
started = False
firstPage = True
initialAmount = None
currentAmount = None
date = None
comment = []
for line in lines:
if not started:
# We are interested in the content after this line:
if soldeCrediteurAuRegex.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
if soldeCrediteurAuRegex.match(line):
initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
currentAmount = initialAmount
print('Initial amount', initialAmount)
print()
started = True
continue
else:
# We aren't interested in the content after this line:
if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
firstPage = False
started = False
continue
# We aren't interested in the content after this line
elif line.startswith('TOTAL DES OPERATIONS'):
break
if firstLineOfPaymentRegex.match(line) is not None:
if date is not None:
print(date, valeur, amount, currentAmount)
print('\n'.join(comment))
print()
date, valeur, amount = line.split()
amount = float(amount.replace(',', '.'))
currentAmount -= amount
comment = []
else:
comment += [line]
break
break
os.chdir(f'{MAIN_BANK_ACCOUNT}/')
for folder in sorted(os.listdir()):
for file in sorted(os.listdir(folder)):
filePath = f'{folder}/{file}'
print(filePath)
transactions = readPdfBankStatement(filePath)[3]
appendTransactions(transactions, MAIN_BANK_ACCOUNT)
if PRINT_TRANSACTIONS:
pprint(transactions)
#break
#break
allTransactions.sort(key = operator.itemgetter('date'))
print(len(allTransactions))
#pprint(allTransactions)
import re
# Matches a comment whose first line is 'VIRT CPTE A CPTE EMIS SUR LE' and whose
# second line starts with one of the codes CEL, LEP, LVJ, L.A or LDD (presumably
# the destination account type) followed by 23 digits.
# Raw string: the previous literal relied on the invalid escape sequence '\.'.
VIRT_A_CPTE_EMIS_SUR_LE_REGEX = re.compile(r'VIRT CPTE A CPTE EMIS SUR LE\n(CEL|LEP|LVJ|L\.A|LDD)\d{23}')

# Prefixes that, on their own, mark a transaction as an own-account movement.
OWN_ACCOUNT_COMMENT_PREFIXES = (
    'DEPOT INITIAL DU COMPTE\n',
    'VIR CPTE A CPTE EMIS /MOTIF ',
)

# The bank account could be taken as a parameter to restrict which comments
# count as own-account transfers.
def isTransactionFromOwnAccounts(comment):
    """Tell whether a transaction comment denotes a transfer between own accounts.

    Args:
        comment: full transaction comment, lines joined with a newline.

    Returns:
        True when `comment` starts with one of the known own-account prefixes
        or matches `VIRT_A_CPTE_EMIS_SUR_LE_REGEX`, False otherwise. Always a
        bool (the previous version could return a `re.Match` or `None`).
    """
    # NOTE: received transfers ('VIR CPTE A CPTE RECU /DE ') and the
    # '/REFDO /REFBEN' suffix were considered in the original but left
    # disabled, so they are not checked here either.
    return (
        comment.startswith(OWN_ACCOUNT_COMMENT_PREFIXES)
        or VIRT_A_CPTE_EMIS_SUR_LE_REGEX.match(comment) is not None
    )
#allTransactions = [transaction for transaction in allTransactions if not isTransactionFromOwnAccounts(transaction['comment'])]
# Aggregate the collected transactions per month and plot them as bar charts.
sortedMonths = sorted(monthlyTransactions.keys())
# Transfers between own accounts are removed before computing the totals.
for month in sortedMonths:
    monthlyTransactions[month] = [transaction for transaction in monthlyTransactions[month] if not isTransactionFromOwnAccounts(transaction['comment'])]

# Debits keep only negative amounts, credits only positive ones.
totalMonthlyDebits = [sum(min(transaction['amount'], 0) for transaction in monthlyTransactions[month]) for month in sortedMonths]
totalMonthlyCredits = [sum(max(transaction['amount'], 0) for transaction in monthlyTransactions[month]) for month in sortedMonths]
totalMonthlyDifferences = [sum(transaction['amount'] for transaction in monthlyTransactions[month]) for month in sortedMonths]

# Running balance: the 'current amount' of the first remaining transaction of
# the first month plus the cumulated monthly differences.
# NOTE(review): this raises IndexError when the first month has no transaction
# left after filtering (unchanged from the original) - confirm whether the
# starting balance should be read before filtering.
initialTotal = monthlyTransactions[sortedMonths[0]][0]['current amount']
totals = []
cumulatedDifference = 0
for monthlyDifference in totalMonthlyDifferences:
    cumulatedDifference += monthlyDifference
    totals += [initialTotal + cumulatedDifference]

fig, ax = plt.subplots()

plt.title('BNP accounts monthly debits and credits')
plt.xlabel('Date')
plt.ylabel('')

ALPHA = 0.5

# Ticks cover every month of the plotted period, including months without data.
xTicks = range(getMonthIndexSinceEpoch(sortedMonths[0]), getMonthIndexSinceEpoch(sortedMonths[-1]) + 1)
# Bars are positioned from the months that actually have data, so the x and
# height sequences always have the same length even if a month is missing.
barPositions = [getMonthIndexSinceEpoch(month) for month in sortedMonths]

totalMonthlyAmountAndLabel = (
    (totalMonthlyDebits, 'Debit'),
    (totalMonthlyCredits, 'Credit'),
    (totalMonthlyDifferences, 'Difference'),
    (totals, 'Total'),
)

# The series are drawn on top of each other, hence the transparency.
for totalMonthlyAmount, totalMonthlyLabel in totalMonthlyAmountAndLabel:
    plt.bar(barPositions, totalMonthlyAmount, alpha = ALPHA, label = totalMonthlyLabel)

plt.legend()
#plt.yscale('symlog')
# Thousands separator on the y axis.
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,}'))

ticksLabels = [getMonthNameFromMonthIndex(monthIndex) for monthIndex in xTicks]
plt.xticks(xTicks, ticksLabels, rotation = 90)
#plt.tight_layout()
# How to show the horizontal lines for subticks?
plt.grid(axis = 'y')
plt.show()

119
utils.py
View File

@ -1,119 +0,0 @@
import subprocess
from datetime import datetime
import re
# All patterns below are meant to be applied with `fullmatch` to one line of
# `pdftotext -layout` output. Raw strings avoid doubling every backslash.

# First line of a transaction: date, first comment line, value date, amount.
# For the non-greedy `?`, see the Stack Overflow answer 766377 (https://stackoverflow.com/a/766377)
FIRST_LINE_OF_PAYMENT_REGEX = re.compile(r'\ +(\d{2}\.\d{2})\ +([A-Z\d /.()*\-,]+?)\ +(\d{2}\.\d{2})\ +([\d ]+,\d{2})')
# Statement title followed by the 'P. <page>/<pages>' marker.
# The dot of 'DEV.' is escaped: unescaped, it matched any character.
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(r" +RELEVE ((DE (COMPTE (CHEQUES|D'EPARGNE LOGEMENT|LEP))|LIVRET (A|JEUNE))|LIVRET DEV\. DURABLE ET SOLIDAIRE) +P\. \d+/\d+")
# Opening balance line: captures the date (dd.mm.yyyy) and the amount.
SOLDE_CREDITEUR_AU_REGEX = re.compile(r'\ +SOLDE CREDITEUR AU (\d{2}\.\d{2}\.\d{4})\ +([\d ]+,\d{2})')
# Totals line carrying two amounts.
TOTAL_DES_OPERATIONS_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})\ +([\d ]+,\d{2})')
# Totals line carrying a single amount.
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})')
# Header row of the transactions table.
COLUMNS_HEADER = re.compile(' +Date +Nature des opérations +Valeur +Débit +Crédit')
def execute(command):
    """Run the argument list `command` and return what it wrote to stdout.

    The output bytes are decoded as UTF-8 before being returned.
    """
    outputBytes = subprocess.check_output(command)
    outputText = outputBytes.decode('utf-8')
    return outputText
def getTextFromPdf(pdfPath):
    """Return the text of the PDF at `pdfPath`, extracted by `pdftotext -layout`.

    The final '-' argument makes `pdftotext` write to standard output, which
    `execute` captures.
    """
    pdfToTextCommand = ['pdftotext', '-layout', pdfPath, '-']
    return execute(pdfToTextCommand)
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of a file name.

    Characters 0-3 are read as the year and characters 4-5 as the month
    (e.g. '20221121.pdf'); anything after them is ignored.
    """
    year = int(aDatetimeStr[:4])
    month = int(aDatetimeStr[4:6])
    return datetime(year, month, 1)
def getMonthIndexSinceEpoch(aDatetime):
    """Map a date to a single integer that grows by one each calendar month.

    The value is `year * 12 + month`, with `month` in 1..12.
    """
    monthsInPastYears = 12 * aDatetime.year
    return monthsInPastYears + aDatetime.month
def getMonthNameFromMonthIndex(monthIndex):
    """Format a month index (`year * 12 + month`) as '<abbreviated month> <year>'.

    The month name comes from `strftime('%b')`, so it follows the active locale.
    """
    # Shift by one so that month 12 stays in its own year.
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    firstDayOfMonth = datetime(year, zeroBasedMonth + 1, 1)
    return firstDayOfMonth.strftime('%b %Y')
def toFloat(group):
    """Convert an amount such as '1 234,56' to a float.

    Spaces are removed and the comma is used as the decimal separator.
    """
    withoutSpaces = group.replace(' ', '')
    withDecimalPoint = withoutSpaces.replace(',', '.')
    return float(withDecimalPoint)
def getDateFollowing(date, initialDate):
    """Resolve a year-less 'dd.mm' date to the first such date on or after `initialDate`.

    Args:
        date: day and month formatted as 'dd.mm'.
        initialDate: `datetime` giving the reference year.

    Returns:
        A `datetime` in `initialDate`'s year, or in the following year when the
        date would otherwise fall before `initialDate` (statement spanning the
        new year) or does not exist in `initialDate`'s year.

    Raises:
        ValueError: if `date` is malformed or exists in neither year.
    """
    def inYear(year):
        # Parse together with the year: parsing 'dd.mm' alone assumes 1900,
        # which is not a leap year, so '29.02' was always rejected.
        return datetime.strptime(f'{date}.{year}', '%d.%m.%Y')

    try:
        resolved = inYear(initialDate.year)
    except ValueError:
        # E.g. '29.02' when only the following year is a leap year.
        return inYear(initialDate.year + 1)
    # To support new year.
    if resolved < initialDate:
        resolved = inYear(initialDate.year + 1)
    return resolved
def _buildTransaction(date, valeur, amount, currentAmount, comment, initialDate):
    """Assemble the dict describing one parsed transaction."""
    return {
        'date': getDateFollowing(date, initialDate),
        'valeur': getDateFollowing(valeur, initialDate),
        'amount': amount,
        'current amount': currentAmount,
        'comment': '\n'.join(comment)
    }


def readPdfBankStatement(filePath):
    """Parse one PDF bank statement into its balance, totals and transactions.

    The text returned by `getTextFromPdf` is scanned line by line. Parsing is
    enabled by the opening balance line (or, after the first page, by the
    columns header), disabled again by a page footer, and stopped by the
    'TOTAL DES OPERATIONS' line.

    Args:
        filePath: '/'-separated path of the PDF; its last component must start
            with the year and month digits read by `getDatetimeFromFileName`.

    Returns:
        A 5-tuple `(initialAmount, totalMonthlyDebit, totalMonthlyCredit,
        transactions, fileDatetime)`:
        - `initialAmount`: opening balance, or None if no such line was found;
        - `totalMonthlyDebit`, `totalMonthlyCredit`: totals read from the
          statement, or None if no totals line was found (the previous version
          raised UnboundLocalError in that case);
        - `transactions`: list of dicts with keys 'date', 'valeur', 'amount'
          (negative for debits), 'current amount' (balance after the
          transaction) and 'comment' (lines joined with a newline);
        - `fileDatetime`: first day of the month encoded in the file name.
    """
    file = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(file)
    lines = getTextFromPdf(filePath).splitlines()

    started = False
    firstPage = True
    initialAmount = None
    initialDate = None
    currentAmount = None
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    # State of the transaction currently being read; `date` is None until the
    # first transaction line has been seen.
    date = None
    valeur = None
    amount = None
    comment = []
    transactions = []
    # Columns (end offsets) of the 'Débit' and 'Crédit' headers, used to tell
    # debits from credits.
    debitIndex = None
    creditIndex = None

    for line in lines:
        if not started:
            # We are interested in the content after this line.
            isColumnsHeader = COLUMNS_HEADER.fullmatch(line) is not None
            if isColumnsHeader:
                debitIndex = line.index('Débit') + len('Débit')
                creditIndex = line.index('Crédit') + len('Crédit')
            soldeCrediteurAuRegexMatch = SOLDE_CREDITEUR_AU_REGEX.fullmatch(line)
            if soldeCrediteurAuRegexMatch is not None:
                initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y')
                initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2))
                print(f'{initialAmount=}')
                currentAmount = initialAmount
                started = True
            elif isColumnsHeader and not firstPage:
                started = True
            continue

        # We aren't interested in the content after a page footer.
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.fullmatch(line) is not None:
            firstPage = False
            started = False
            continue

        # The totals line ends the list of transactions.
        # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
        # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
        totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.fullmatch(line)
        totalDesOperationsCreditOnlyRegexMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.fullmatch(line)
        if totalDesOperationsRegexMatch is not None or totalDesOperationsCreditOnlyRegexMatch is not None:
            if totalDesOperationsRegexMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
            else:
                totalMonthlyCredit = toFloat(totalDesOperationsCreditOnlyRegexMatch.group(1))
                totalMonthlyDebit = 0
            print(f'{totalMonthlyDebit=}')
            print(f'{totalMonthlyCredit=}')
            break

        firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.fullmatch(line)
        if firstLineOfPaymentRegexMatch is not None:
            # A new transaction starts: flush the previous one, whose comment
            # is now complete, before the balance is updated.
            if date is not None:
                transactions.append(_buildTransaction(date, valeur, amount, currentAmount, comment, initialDate))
            date, firstCommentLine, valeur, amount = firstLineOfPaymentRegexMatch.groups()
            amount = toFloat(amount)
            # The amount is right-aligned under its column: it is a debit when
            # the line ends closer to the 'Débit' header than to 'Crédit'.
            # NOTE(review): assumes the columns header was seen before the
            # first transaction; otherwise the indices are still None.
            lineLen = len(line)
            if abs(debitIndex - lineLen) < abs(creditIndex - lineLen):
                amount *= -1
            currentAmount += amount
            comment = [firstCommentLine]
        elif line != '':
            # Continuation line of the current transaction's comment.
            comment.append(line.strip())

    # Flush the last transaction, which no following line terminated.
    if date is not None:
        transactions.append(_buildTransaction(date, valeur, amount, currentAmount, comment, initialDate))
    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime