Compare commits
22 Commits
6f97554195
...
4969025722
Author | SHA1 | Date | |
---|---|---|---|
4969025722 | |||
91fb433258 | |||
db18d88f35 | |||
e19132c33f | |||
e0c99bc068 | |||
2d4a10da05 | |||
69d6442966 | |||
7d7ced9b81 | |||
9e8de4b4c7 | |||
046835064d | |||
31b81fa33e | |||
c425ebcb00 | |||
9eed49c344 | |||
4123118020 | |||
b99838696e | |||
bbe93d0939 | |||
17acc478b7 | |||
ee6e850174 | |||
f090374caf | |||
ef3e8ef337 | |||
b75d5fc86f | |||
480072779c |
19
README.md
Normal file
19
README.md
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
# BNP PDF statement parser
|
||||||
|
|
||||||
|
Depends on `pdftotext`.
|
||||||
|
|
||||||
|
Assuming file hierarchy like:
|
||||||
|
|
||||||
|
```
|
||||||
|
.
|
||||||
|
├── compte_de_cheques/
|
||||||
|
│ ├── 2022/
|
||||||
|
│ │ ├── 20221121.pdf
|
||||||
|
│ │ └── 20221221.pdf
|
||||||
|
│ └── 2023/
|
||||||
|
│ ├── 20230123.pdf
|
||||||
|
│ └── 20230221.pdf
|
||||||
|
└── livret_a/
|
||||||
|
├── 20230721.pdf
|
||||||
|
└── 20240122.pdf
|
||||||
|
```
|
@ -1,79 +1,106 @@
|
|||||||
#!/usr/bin/python3
|
#!/usr/bin/env python
|
||||||
|
|
||||||
# Depends on `pdftotext`.
|
import os
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib.ticker as ticker
|
||||||
|
from datetime import datetime
|
||||||
|
from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement
|
||||||
|
import operator
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
import os, subprocess, re, config
|
PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'
|
||||||
|
|
||||||
path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'
|
os.chdir(PATH)
|
||||||
|
|
||||||
os.chdir(path)
|
PRINT_TRANSACTIONS = False
|
||||||
|
|
||||||
'''
|
MAIN_BANK_ACCOUNT = 'compte_de_cheques'
|
||||||
Assuming file hierarchy like:
|
|
||||||
|
|
||||||
2022
|
allTransactions = []
|
||||||
├── 20221121.pdf
|
monthlyTransactions = {}
|
||||||
└── 20221221.pdf
|
|
||||||
2023
|
|
||||||
├── 20230123.pdf
|
|
||||||
└── 20230221.pdf
|
|
||||||
'''
|
|
||||||
|
|
||||||
def execute(command):
|
def appendTransactions(transactions, bankAccount):
|
||||||
return subprocess.check_output(command).decode('utf-8')
|
global allTransactions
|
||||||
|
for transaction in transactions:
|
||||||
def getTextFromPdf(pdfPath):
|
transaction['bank account'] = bankAccount
|
||||||
return execute(['pdftotext', '-raw', pdfPath, '-')
|
allTransactions += [transaction]
|
||||||
|
date = transaction['date'].replace(day = 1)
|
||||||
firstLineOfPaymentRegex = re.compile('\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
|
monthlyTransactions[date] = monthlyTransactions.get(date, []) + [transaction]
|
||||||
endPageAfterTheFirstOneRegex = re.compile('P\. \d+/\d+')
|
|
||||||
soldeCrediteurAuRegex = re.compile('SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')
|
|
||||||
|
|
||||||
for folder in os.listdir():
|
for folder in os.listdir():
|
||||||
for file in os.listdir(folder):
|
if folder != MAIN_BANK_ACCOUNT:
|
||||||
#folder = '2022'
|
for file in os.listdir(folder):
|
||||||
#file = '20220321.pdf'
|
filePath = f'{folder}/{file}'
|
||||||
|
print(filePath)
|
||||||
|
transactions = readPdfBankStatement(filePath)[3]
|
||||||
|
pprint(transactions)
|
||||||
|
appendTransactions(transactions, folder)
|
||||||
|
|
||||||
|
os.chdir(f'{MAIN_BANK_ACCOUNT}/')
|
||||||
|
|
||||||
|
for folder in sorted(os.listdir()):
|
||||||
|
for file in sorted(os.listdir(folder)):
|
||||||
filePath = f'{folder}/{file}'
|
filePath = f'{folder}/{file}'
|
||||||
print(filePath)
|
print(filePath)
|
||||||
content = getTextFromPdf(filePath)
|
transactions = readPdfBankStatement(filePath)[3]
|
||||||
lines = content.splitlines()
|
appendTransactions(transactions, MAIN_BANK_ACCOUNT)
|
||||||
started = False
|
if PRINT_TRANSACTIONS:
|
||||||
firstPage = True
|
pprint(transactions)
|
||||||
initialAmount = None
|
#break
|
||||||
currentAmount = None
|
#break
|
||||||
date = None
|
|
||||||
comment = []
|
|
||||||
for line in lines:
|
|
||||||
if not started:
|
|
||||||
# We are interested in the content after this line:
|
|
||||||
if soldeCrediteurAuRegex.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
|
|
||||||
if soldeCrediteurAuRegex.match(line):
|
|
||||||
initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
|
|
||||||
currentAmount = initialAmount
|
|
||||||
print('Initial amount', initialAmount)
|
|
||||||
print()
|
|
||||||
started = True
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
# We aren't interested in the content after this line:
|
|
||||||
if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
|
|
||||||
firstPage = False
|
|
||||||
started = False
|
|
||||||
continue
|
|
||||||
# We aren't interested in the content after this line
|
|
||||||
elif line.startswith('TOTAL DES OPERATIONS'):
|
|
||||||
break
|
|
||||||
if firstLineOfPaymentRegex.match(line) is not None:
|
|
||||||
if date is not None:
|
|
||||||
print(date, valeur, amount, currentAmount)
|
|
||||||
print('\n'.join(comment))
|
|
||||||
print()
|
|
||||||
date, valeur, amount = line.split()
|
|
||||||
amount = float(amount.replace(',', '.'))
|
|
||||||
currentAmount -= amount
|
|
||||||
comment = []
|
|
||||||
else:
|
|
||||||
comment += [line]
|
|
||||||
break
|
|
||||||
break
|
|
||||||
|
|
||||||
|
# Put every collected transaction in chronological order, whatever account it
# came from, then report how many were gathered.
allTransactions.sort(key = lambda transaction: transaction['date'])
print(len(allTransactions))
#pprint(allTransactions)
|
||||||
|
|
||||||
|
import re
|
||||||
|
# First comment line of an outgoing transfer to a savings account, followed on
# the next line by the account kind and a 23-digit identifier.
VIRT_A_CPTE_EMIS_SUR_LE_REGEX = re.compile(r'VIRT CPTE A CPTE EMIS SUR LE\n(CEL|LEP|LVJ|L\.A|LDD)\d{23}')

# Comment prefixes that mark a movement between the holder's own accounts.
OWN_ACCOUNTS_COMMENT_PREFIXES = (
    'DEPOT INITIAL DU COMPTE\n',
    'VIR CPTE A CPTE EMIS /MOTIF ',
)

# Could specify the bank account to restrict own-account comments.
def isTransactionFromOwnAccounts(comment):
    """Return whether a transaction comment denotes a transfer between own accounts.

    `comment` is the multi-line comment of a transaction (lines joined with
    `\\n`). The result is always a `bool`.

    NOTE: incoming transfers (comments starting with
    'VIR CPTE A CPTE RECU /DE ') are currently not matched.
    """
    return (
        comment.startswith(OWN_ACCOUNTS_COMMENT_PREFIXES)
        or VIRT_A_CPTE_EMIS_SUR_LE_REGEX.match(comment) is not None
    )
|
||||||
|
|
||||||
|
#allTransactions = [transaction for transaction in allTransactions if not isTransactionFromOwnAccounts(transaction['comment'])]

# Months (first day of each month) in chronological order.
sortedMonths = sorted(monthlyTransactions)

# Drop transfers between own accounts from every month.
for month in sortedMonths:
    keptTransactions = []
    for transaction in monthlyTransactions[month]:
        if not isTransactionFromOwnAccounts(transaction['comment']):
            keptTransactions.append(transaction)
    monthlyTransactions[month] = keptTransactions

# Per-month sums: negative amounts only, positive amounts only, and net.
totalMonthlyDebits = []
totalMonthlyCredits = []
totalMonthlyDifferences = []
for month in sortedMonths:
    amounts = [transaction['amount'] for transaction in monthlyTransactions[month]]
    totalMonthlyDebits.append(sum([min(amount, 0) for amount in amounts]))
    totalMonthlyCredits.append(sum([max(amount, 0) for amount in amounts]))
    totalMonthlyDifferences.append(sum(amounts))

# Running total: the 'current amount' of the very first kept transaction plus
# the net differences of all months up to and including the current one.
totals = [
    monthlyTransactions[sortedMonths[0]][0]['current amount'] + sum(totalMonthlyDifferences[:monthCount])
    for monthCount in range(1, len(sortedMonths) + 1)
]
|
||||||
|
|
||||||
|
# Bar chart of the monthly debits, credits, net differences and running totals.
fig, ax = plt.subplots()
plt.title('BNP accounts monthly debits and credits')
plt.xlabel('Date')
plt.ylabel('€')
ALPHA = 0.5

# Bars sit at the months that actually have transactions, so the x positions
# always have the same length as the per-month series even when a calendar
# month is missing. The ticks still cover every month of the range.
barPositions = [getMonthIndexSinceEpoch(month) for month in sortedMonths]
xTicks = range(barPositions[0], barPositions[-1] + 1)
totalMonthlyAmountAndLabel = (
    (totalMonthlyDebits, 'Debit'),
    (totalMonthlyCredits, 'Credit'),
    (totalMonthlyDifferences, 'Difference'),
    (totals, 'Total'),
)
# The series are overlaid at the same positions; transparency keeps them readable.
for totalMonthlyAmount, totalMonthlyLabel in totalMonthlyAmountAndLabel:
    plt.bar(barPositions, totalMonthlyAmount, alpha = ALPHA, label = totalMonthlyLabel)
plt.legend()

#plt.yscale('symlog')
# Thousands separators on the y axis.
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,}'))

ticksLabels = [getMonthNameFromMonthIndex(monthIndex) for monthIndex in xTicks]
plt.xticks(xTicks, ticksLabels, rotation = 90)
#plt.tight_layout()
# TODO: how to show the horizontal lines for subticks?
plt.grid(axis = 'y')

plt.show()
|
119
utils.py
Normal file
119
utils.py
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
import subprocess
|
||||||
|
from datetime import datetime
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Patterns for the lines of a statement as rendered by `pdftotext -layout`.
# They are meant to be used with `fullmatch` on a single line.

# One operation line: date, first comment line, value date, amount.
# The comment group uses the non-greedy `+?`, see the Stack Overflow answer 766377: https://stackoverflow.com/a/766377
FIRST_LINE_OF_PAYMENT_REGEX = re.compile(r'\ +(\d{2}\.\d{2})\ +([A-Z\d /.()*\-,]+?)\ +(\d{2}\.\d{2})\ +([\d ]+,\d{2})')
# Page footer carrying the statement kind and the `P. x/y` page counter.
# The dot of `DEV.` is escaped so that only the literal abbreviation matches.
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(r" +RELEVE ((DE (COMPTE (CHEQUES|D'EPARGNE LOGEMENT|LEP))|LIVRET (A|JEUNE))|LIVRET DEV\. DURABLE ET SOLIDAIRE) +P\. \d+/\d+")
# Opening balance line: date (`dd.mm.yyyy`) and amount.
SOLDE_CREDITEUR_AU_REGEX = re.compile(r'\ +SOLDE CREDITEUR AU (\d{2}\.\d{2}\.\d{4})\ +([\d ]+,\d{2})')
# Totals line with two amounts.
TOTAL_DES_OPERATIONS_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})\ +([\d ]+,\d{2})')
# Totals line with a single amount.
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})')
# Header of the operations table.
COLUMNS_HEADER = re.compile(r' +Date +Nature des opérations +Valeur +Débit +Crédit')
|
||||||
|
|
||||||
|
def execute(command):
    """Run `command` (an argument list) and return its standard output as text.

    The output bytes are decoded as UTF-8. `subprocess.check_output` raises
    `subprocess.CalledProcessError` when the command exits with a non-zero
    status.
    """
    rawOutput = subprocess.check_output(command)
    return rawOutput.decode('utf-8')
|
||||||
|
|
||||||
|
def getTextFromPdf(pdfPath):
    """Return the text of the PDF at `pdfPath` as produced by `pdftotext -layout`.

    The trailing `-` is passed to `pdftotext` as its output argument.
    """
    command = ['pdftotext', '-layout', pdfPath, '-']
    return execute(command)
|
||||||
|
|
||||||
|
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of `aDatetimeStr`.

    The first four characters are read as the year and the next two as the
    month (e.g. `20221121.pdf` gives 2022-11-01); anything after is ignored.
    """
    year = int(aDatetimeStr[:4])
    month = int(aDatetimeStr[4:6])
    return datetime(year, month, 1)
|
||||||
|
|
||||||
|
def getMonthIndexSinceEpoch(aDatetime):
    """Return a monotonically increasing month number for `aDatetime`.

    The value is `year * 12 + month` with `month` in 1..12, so consecutive
    calendar months map to consecutive integers.
    """
    monthsInYear = 12
    return monthsInYear * aDatetime.year + aDatetime.month
|
||||||
|
|
||||||
|
def getMonthNameFromMonthIndex(monthIndex):
    """Return the `'%b %Y'` label (e.g. `Jan 2023`) for a month index.

    `monthIndex` is `year * 12 + month` with `month` in 1..12.
    """
    # Shift to a zero-based month so that December stays in its own year.
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    firstDayOfMonth = datetime(year, zeroBasedMonth + 1, 1)
    return firstDayOfMonth.strftime('%b %Y')
|
||||||
|
|
||||||
|
def toFloat(group):
    """Convert an amount such as `'1 234,56'` to the float `1234.56`.

    Spaces are removed and the decimal comma becomes a decimal point.
    """
    withoutSpaces = group.replace(' ', '')
    withDecimalPoint = withoutSpaces.replace(',', '.')
    return float(withDecimalPoint)
|
||||||
|
|
||||||
|
def getDateFollowing(date, initialDate):
    """Return the first `dd.mm` date that is on or after `initialDate`.

    `date` is a day and month without a year (e.g. `'02.01'`). The year of
    `initialDate` is tried first; when that date falls before `initialDate`
    (or does not exist in that year), the following year is used, which
    supports statements spanning a new year.

    The string is parsed together with the candidate year, so `29.02` is
    accepted in leap years.

    Raises `ValueError` when `date` is malformed or yields no valid date on or
    after `initialDate` in either year.
    """
    for year in (initialDate.year, initialDate.year + 1):
        try:
            candidate = datetime.strptime(f'{date}.{year:04d}', '%d.%m.%Y')
        except ValueError:
            # Malformed input, or a day that does not exist in this year.
            continue
        if candidate >= initialDate:
            return candidate
    raise ValueError(f'No valid date {date!r} on or after {initialDate}')
|
||||||
|
|
||||||
|
def _buildTransaction(date, valeur, amount, currentAmount, comment, initialDate):
    """Build the dictionary describing one parsed statement operation."""
    return {
        'date': getDateFollowing(date, initialDate),
        'valeur': getDateFollowing(valeur, initialDate),
        'amount': amount,
        'current amount': currentAmount,
        'comment': '\n'.join(comment)
    }


def readPdfBankStatement(filePath):
    """Parse a PDF bank statement into its balance, totals and transactions.

    `filePath` is a `/`-separated path whose last component starts with the
    statement date as `YYYYMM` (e.g. `2022/20221121.pdf`).

    Returns the tuple `(initialAmount, totalMonthlyDebit, totalMonthlyCredit,
    transactions, fileDatetime)`:

    - `initialAmount`: amount of the `SOLDE CREDITEUR AU` line, or `None` when
      the statement has none.
    - `totalMonthlyDebit`, `totalMonthlyCredit`: amounts of the
      `TOTAL DES OPERATIONS` line (a single amount is taken as the credit and
      the debit is then 0), or `None` when that line is absent.
    - `transactions`: list of dictionaries with the keys `date`, `valeur`,
      `amount` (negative for debits), `current amount` (balance once the
      transaction is applied) and `comment` (lines joined with `\\n`).
    - `fileDatetime`: first day of the month encoded in the file name.

    The initial amount and the totals are also printed.
    """
    fileName = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(fileName)
    lines = getTextFromPdf(filePath).splitlines()

    # Whether the current line lies inside an operations table.
    started = False
    firstPage = True
    initialAmount = None
    initialDate = None
    currentAmount = None
    # Remain `None` when the statement has no `TOTAL DES OPERATIONS` line.
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    # Fields of the transaction being accumulated; `date` is `None` until the
    # first operation line has been seen.
    date = None
    valeur = None
    amount = None
    comment = []
    transactions = []
    # Columns at which the 'Débit' and 'Crédit' headers end.
    debitIndex = None
    creditIndex = None

    for line in lines:
        if not started:
            # We are interested in the content after the opening balance line
            # or, from the second page on, after the columns header.
            soldeCrediteurAuRegexMatch = SOLDE_CREDITEUR_AU_REGEX.fullmatch(line)
            isColumnsHeader = COLUMNS_HEADER.fullmatch(line) is not None
            if isColumnsHeader:
                debitIndex = line.index('Débit') + len('Débit')
                creditIndex = line.index('Crédit') + len('Crédit')
            if soldeCrediteurAuRegexMatch is not None:
                initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y')
                initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2))
                print(f'{initialAmount=}')
                currentAmount = initialAmount
                started = True
            elif isColumnsHeader and not firstPage:
                started = True
            continue

        # We aren't interested in the content after a page footer.
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.fullmatch(line) is not None:
            firstPage = False
            started = False
            continue

        # We aren't interested in the content after the totals line.
        totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.fullmatch(line)
        totalDesOperationsCreditOnlyRegexMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.fullmatch(line)
        if totalDesOperationsRegexMatch is not None or totalDesOperationsCreditOnlyRegexMatch is not None:
            # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
            # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
            if totalDesOperationsRegexMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
            else:
                totalMonthlyCredit = toFloat(totalDesOperationsCreditOnlyRegexMatch.group(1))
                totalMonthlyDebit = 0
            print(f'{totalMonthlyDebit=}')
            print(f'{totalMonthlyCredit=}')
            break

        firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.fullmatch(line)
        if firstLineOfPaymentRegexMatch is not None:
            # A new operation starts: store the previous one, whose balance is
            # the running amount before the new operation is applied.
            if date is not None:
                transactions.append(_buildTransaction(date, valeur, amount, currentAmount, comment, initialDate))
            date, firstCommentLine, valeur, amount = firstLineOfPaymentRegexMatch.groups()
            amount = toFloat(amount)
            # The amount is right-aligned under its column header, so the line
            # ends closer to the end of 'Débit' for a debit.
            lineLen = len(line)
            if abs(debitIndex - lineLen) < abs(creditIndex - lineLen):
                amount *= -1
            currentAmount += amount
            comment = [firstCommentLine]
        elif line != '':
            # Any other non-empty line continues the current comment.
            comment.append(line.strip())

    # Store the last operation, which no following operation line flushed.
    if date is not None:
        transactions.append(_buildTransaction(date, valeur, amount, currentAmount, comment, initialDate))
    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime
|
Loading…
Reference in New Issue
Block a user