Compare commits
No commits in common. "49690257222a19668ae50f701f038be945ba8d69" and "6f97554195b4f9de4d28782a706e5389ba756531" have entirely different histories.
4969025722
...
6f97554195
19
README.md
19
README.md
@ -1,19 +0,0 @@
|
|||||||
# BNP PDF statement parser
|
|
||||||
|
|
||||||
Depends on `pdftotext`.
|
|
||||||
|
|
||||||
The parser assumes a file hierarchy like:
|
|
||||||
|
|
||||||
```
|
|
||||||
.
|
|
||||||
├── compte_de_cheques/
|
|
||||||
│ ├── 2022/
|
|
||||||
│ │ ├── 20221121.pdf
|
|
||||||
│ │ └── 20221221.pdf
|
|
||||||
│ └── 2023/
|
|
||||||
│ ├── 20230123.pdf
|
|
||||||
│ └── 20230221.pdf
|
|
||||||
└── livret_a/
|
|
||||||
├── 20230721.pdf
|
|
||||||
└── 20240122.pdf
|
|
||||||
```
|
|
@ -1,106 +1,79 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/python3
|
||||||
|
|
||||||
import os
|
# Depends on `pdftotext`.
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import matplotlib.ticker as ticker
|
|
||||||
from datetime import datetime
|
|
||||||
from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement
|
|
||||||
import operator
|
|
||||||
from pprint import pprint
|
|
||||||
|
|
||||||
PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'
|
import os, subprocess, re, config
|
||||||
|
|
||||||
os.chdir(PATH)
|
path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'
|
||||||
|
|
||||||
PRINT_TRANSACTIONS = False
|
os.chdir(path)
|
||||||
|
|
||||||
MAIN_BANK_ACCOUNT = 'compte_de_cheques'
|
'''
|
||||||
|
Assuming file hierarchy like:
|
||||||
|
|
||||||
allTransactions = []
|
2022
|
||||||
monthlyTransactions = {}
|
├── 20221121.pdf
|
||||||
|
└── 20221221.pdf
|
||||||
|
2023
|
||||||
|
├── 20230123.pdf
|
||||||
|
└── 20230221.pdf
|
||||||
|
'''
|
||||||
|
|
||||||
def appendTransactions(transactions, bankAccount):
|
def execute(command):
|
||||||
global allTransactions
|
return subprocess.check_output(command).decode('utf-8')
|
||||||
for transaction in transactions:
|
|
||||||
transaction['bank account'] = bankAccount
|
def getTextFromPdf(pdfPath):
|
||||||
allTransactions += [transaction]
|
return execute(['pdftotext', '-raw', pdfPath, '-')
|
||||||
date = transaction['date'].replace(day = 1)
|
|
||||||
monthlyTransactions[date] = monthlyTransactions.get(date, []) + [transaction]
|
firstLineOfPaymentRegex = re.compile('\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
|
||||||
|
endPageAfterTheFirstOneRegex = re.compile('P\. \d+/\d+')
|
||||||
|
soldeCrediteurAuRegex = re.compile('SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')
|
||||||
|
|
||||||
for folder in os.listdir():
|
for folder in os.listdir():
|
||||||
if folder != MAIN_BANK_ACCOUNT:
|
|
||||||
for file in os.listdir(folder):
|
for file in os.listdir(folder):
|
||||||
|
#folder = '2022'
|
||||||
|
#file = '20220321.pdf'
|
||||||
filePath = f'{folder}/{file}'
|
filePath = f'{folder}/{file}'
|
||||||
print(filePath)
|
print(filePath)
|
||||||
transactions = readPdfBankStatement(filePath)[3]
|
content = getTextFromPdf(filePath)
|
||||||
pprint(transactions)
|
lines = content.splitlines()
|
||||||
appendTransactions(transactions, folder)
|
started = False
|
||||||
|
firstPage = True
|
||||||
|
initialAmount = None
|
||||||
|
currentAmount = None
|
||||||
|
date = None
|
||||||
|
comment = []
|
||||||
|
for line in lines:
|
||||||
|
if not started:
|
||||||
|
# We are interested in the content after this line:
|
||||||
|
if soldeCrediteurAuRegex.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
|
||||||
|
if soldeCrediteurAuRegex.match(line):
|
||||||
|
initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
|
||||||
|
currentAmount = initialAmount
|
||||||
|
print('Initial amount', initialAmount)
|
||||||
|
print()
|
||||||
|
started = True
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# We aren't interested in the content after this line:
|
||||||
|
if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
|
||||||
|
firstPage = False
|
||||||
|
started = False
|
||||||
|
continue
|
||||||
|
# We aren't interested in the content after this line
|
||||||
|
elif line.startswith('TOTAL DES OPERATIONS'):
|
||||||
|
break
|
||||||
|
if firstLineOfPaymentRegex.match(line) is not None:
|
||||||
|
if date is not None:
|
||||||
|
print(date, valeur, amount, currentAmount)
|
||||||
|
print('\n'.join(comment))
|
||||||
|
print()
|
||||||
|
date, valeur, amount = line.split()
|
||||||
|
amount = float(amount.replace(',', '.'))
|
||||||
|
currentAmount -= amount
|
||||||
|
comment = []
|
||||||
|
else:
|
||||||
|
comment += [line]
|
||||||
|
break
|
||||||
|
break
|
||||||
|
|
||||||
# From here on, paths are relative to the main bank account folder, which is
# walked one sub-folder level deep.
os.chdir(f'{MAIN_BANK_ACCOUNT}/')

# Sub-folders and the statements they contain are both visited in sorted order.
mainAccountStatementPaths = (
    f'{statementFolder}/{statementFile}'
    for statementFolder in sorted(os.listdir())
    for statementFile in sorted(os.listdir(statementFolder))
)
for statementPath in mainAccountStatementPaths:
    print(statementPath)
    # Index 3 of the tuple returned by `readPdfBankStatement` is the list of transactions.
    statementTransactions = readPdfBankStatement(statementPath)[3]
    appendTransactions(statementTransactions, MAIN_BANK_ACCOUNT)
    if PRINT_TRANSACTIONS:
        pprint(statementTransactions)

# Sort in place so that every reference to the list sees the chronological order.
allTransactions.sort(key = lambda transaction: transaction['date'])
print(len(allTransactions))
|
|
||||||
#pprint(allTransactions)
|
|
||||||
|
|
||||||
import re
|
|
||||||
# Comment of a transfer emitted towards another account: the fixed first line,
# then on the next line a three-character account code (CEL, LEP, LVJ, L.A or LDD)
# immediately followed by 23 digits.
# Raw string, so that `\.` and `\d` reach the regex engine without relying on
# Python leaving unknown string escapes untouched.
VIRT_A_CPTE_EMIS_SUR_LE_REGEX = re.compile(r'VIRT CPTE A CPTE EMIS SUR LE\n(CEL|LEP|LVJ|L\.A|LDD)\d{23}')

# The bank account could also be specified to restrict which comments count as own-account transfers.
def isTransactionFromOwnAccounts(comment):
    """Tell whether a transaction comment denotes a movement between own accounts.

    A comment qualifies when it starts with the initial deposit label, with the
    emitted account-to-account transfer label, or when its beginning matches
    `VIRT_A_CPTE_EMIS_SUR_LE_REGEX`.

    NOTE: received transfers (comments starting with 'VIR CPTE A CPTE RECU /DE ')
    are not matched; the check for them was already disabled in the original code.

    Parameters:
        comment: the transaction comment, lines separated by '\\n'.

    Returns:
        `True` if the comment matches one of the patterns above, `False` otherwise.
    """
    ownAccountPrefixes = (
        'DEPOT INITIAL DU COMPTE\n',
        'VIR CPTE A CPTE EMIS /MOTIF ',
    )
    if comment.startswith(ownAccountPrefixes):
        return True
    # `match` only anchors at the beginning, trailing comment lines are allowed.
    return VIRT_A_CPTE_EMIS_SUR_LE_REGEX.match(comment) is not None
|
|
||||||
|
|
||||||
#allTransactions = [transaction for transaction in allTransactions if not isTransactionFromOwnAccounts(transaction['comment'])]
|
|
||||||
sortedMonths = sorted(monthlyTransactions.keys())

# Transfers between own accounts are neither an expense nor an income: drop them
# before aggregating.
for month in sortedMonths:
    monthlyTransactions[month] = [transaction for transaction in monthlyTransactions[month] if not isTransactionFromOwnAccounts(transaction['comment'])]

# Per-month aggregates, in the same order as `sortedMonths`. Debits are the
# negative amounts and credits the positive ones.
totalMonthlyDebits = [sum(min(transaction['amount'], 0) for transaction in monthlyTransactions[month]) for month in sortedMonths]
totalMonthlyCredits = [sum(max(transaction['amount'], 0) for transaction in monthlyTransactions[month]) for month in sortedMonths]
totalMonthlyDifferences = [sum(transaction['amount'] for transaction in monthlyTransactions[month]) for month in sortedMonths]

# Running balance: the 'current amount' of the first transaction kept for the
# first month, plus the cumulated monthly differences.
# NOTE(review): this raises `IndexError` when that first month has no transaction
# left after the filtering above - confirm whether that can happen.
initialTotal = monthlyTransactions[sortedMonths[0]][0]['current amount']
totals = []
cumulatedDifference = 0
for totalMonthlyDifference in totalMonthlyDifferences:
    cumulatedDifference += totalMonthlyDifference
    totals += [initialTotal + cumulatedDifference]

fig, ax = plt.subplots()
plt.title('BNP accounts monthly debits and credits')
plt.xlabel('Date')
plt.ylabel('€')
ALPHA = 0.5

# One tick per calendar month between the first and the last month, even for
# months without any transaction.
xTicks = range(getMonthIndexSinceEpoch(sortedMonths[0]), getMonthIndexSinceEpoch(sortedMonths[-1]) + 1)
# Bars are positioned from the months actually present, so that the positions
# always have as many entries as the aggregates; using `xTicks` instead makes
# `plt.bar` fail with a shape mismatch as soon as a month is missing.
barPositions = [getMonthIndexSinceEpoch(month) for month in sortedMonths]
totalMonthlyAmountAndLabel = (
    (totalMonthlyDebits, 'Debit'),
    (totalMonthlyCredits, 'Credit'),
    (totalMonthlyDifferences, 'Difference'),
    (totals, 'Total'),
)
# The series are drawn at the same positions, hence the transparency.
for totalMonthlyAmount, totalMonthlyLabel in totalMonthlyAmountAndLabel:
    plt.bar(barPositions, totalMonthlyAmount, alpha = ALPHA, label = totalMonthlyLabel)
plt.legend()

# Thousands separator on the amounts axis.
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,}'))

ticksLabels = [getMonthNameFromMonthIndex(monthIndex) for monthIndex in xTicks]
plt.xticks(xTicks, ticksLabels, rotation = 90)
# TODO: how to show the horizontal lines for subticks?
plt.grid(axis = 'y')

plt.show()
|
|
119
utils.py
119
utils.py
@ -1,119 +0,0 @@
|
|||||||
import subprocess
|
|
||||||
from datetime import datetime
|
|
||||||
import re
|
|
||||||
|
|
||||||
# For the non-greedy `?` quantifier, see [Stack Overflow answer 766377](https://stackoverflow.com/a/766377)
|
|
||||||
# All the patterns below are meant to be matched against one whole line of the
# `pdftotext -layout` output.

# First line of an operation: operation date, first line of the comment, value date and amount.
FIRST_LINE_OF_PAYMENT_REGEX = re.compile(r'\ +(\d{2}\.\d{2})\ +([A-Z\d /.()*\-,]+?)\ +(\d{2}\.\d{2})\ +([\d ]+,\d{2})')
# Header line of the pages following the first one: statement type and page number.
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(r" +RELEVE ((DE (COMPTE (CHEQUES|D'EPARGNE LOGEMENT|LEP))|LIVRET (A|JEUNE))|LIVRET DEV. DURABLE ET SOLIDAIRE) +P\. \d+/\d+")
# Opening balance line: balance date and amount.
SOLDE_CREDITEUR_AU_REGEX = re.compile(r'\ +SOLDE CREDITEUR AU (\d{2}\.\d{2}\.\d{4})\ +([\d ]+,\d{2})')
# Totals line holding two amounts.
TOTAL_DES_OPERATIONS_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})\ +([\d ]+,\d{2})')
# Totals line holding a single amount.
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})')
# Header line of the operations table.
COLUMNS_HEADER = re.compile(r' +Date +Nature des opérations +Valeur +Débit +Crédit')
|
|
||||||
|
|
||||||
def execute(command):
    """Run *command* and return what it wrote to its standard output, decoded as UTF-8.

    *command* is handed unchanged to `subprocess.check_output`, which raises
    `subprocess.CalledProcessError` when the process exits with a non-zero status.
    """
    rawOutput = subprocess.check_output(command)
    return str(rawOutput, 'utf-8')
|
|
||||||
|
|
||||||
def getTextFromPdf(pdfPath):
    """Return the text of the PDF at *pdfPath*, as extracted by `pdftotext -layout`.

    The trailing `-` argument asks `pdftotext` to write the text to its standard
    output instead of a file, so that `execute` can capture it.
    """
    pdftotextCommand = ['pdftotext', '-layout', pdfPath, '-']
    return execute(pdftotextCommand)
|
|
||||||
|
|
||||||
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the beginning of *aDatetimeStr*.

    The first four characters are read as the year and the two following ones as
    the month; whatever comes after them (day, extension...) is ignored.
    """
    year = int(aDatetimeStr[:4])
    month = int(aDatetimeStr[4:6])
    return datetime(year = year, month = month, day = 1)
|
|
||||||
|
|
||||||
def getMonthIndexSinceEpoch(aDatetime):
    """Map the month of *aDatetime* to a single integer: twelve times the year plus the month number.

    Consecutive calendar months get consecutive indexes, also across years.
    """
    monthsPerYear = 12
    return monthsPerYear * aDatetime.year + aDatetime.month
|
|
||||||
|
|
||||||
def getMonthNameFromMonthIndex(monthIndex):
    """Format a month index (twelve times the year plus the month number) with `'%b %Y'`.

    The index is shifted by one before the division so that month 12 stays in its
    own year instead of overflowing into the next one.
    """
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    firstDayOfMonth = datetime(year, zeroBasedMonth + 1, 1)
    return firstDayOfMonth.strftime('%b %Y')
|
|
||||||
|
|
||||||
def toFloat(group):
    """Convert an amount written with a decimal comma and space separators to a float.

    For instance `'1 234,56'` gives `1234.56`.
    """
    # Single pass: commas become dots and spaces are removed.
    normalized = group.translate(str.maketrans(',', '.', ' '))
    return float(normalized)
|
|
||||||
|
|
||||||
def getDateFollowing(date, initialDate):
    """Return the first occurrence of the day `date` that is not before `initialDate`.

    Statements only give the day and the month of an operation, so the year is
    taken from `initialDate`, or the following year when the resulting date would
    be before `initialDate` (statement spanning a new year).

    The day and month are parsed by hand rather than with
    `datetime.strptime(date, '%d.%m')`: without a year `strptime` validates the
    day against 1900, which is not a leap year, so `'29.02'` was always rejected.

    NOTE(review): a date that is legitimately before `initialDate` (possibly a
    value date preceding the statement start) is moved to the following year -
    confirm this is acceptable for the callers.

    Parameters:
        date: day and month as `'DD.MM'`.
        initialDate: `datetime` from which the returned date has to follow.

    Returns:
        The matching `datetime`, at midnight.

    Raises:
        ValueError: if `date` is malformed, or is not a valid day in either
            candidate year.
    """
    day, month = (int(part) for part in date.split('.'))
    for year in (initialDate.year, initialDate.year + 1):
        try:
            candidate = datetime(year, month, day)
        except ValueError:
            # Typically 29.02 in a year that is not a leap year: try the other year.
            continue
        if candidate >= initialDate:
            return candidate
    raise ValueError(f'No valid date {date!r} following {initialDate}')
|
|
||||||
|
|
||||||
def _buildTransaction(date, valeur, amount, currentAmount, comment, initialDate):
    """Assemble the dictionary describing one parsed operation."""
    return {
        'date': getDateFollowing(date, initialDate),
        'valeur': getDateFollowing(valeur, initialDate),
        'amount': amount,
        'current amount': currentAmount,
        'comment': '\n'.join(comment)
    }


def readPdfBankStatement(filePath):
    """Parse the operations of a PDF bank statement.

    The statement is converted to text with `getTextFromPdf` and read line by
    line. Lines are only considered inside the operations table: from the
    `SOLDE CREDITEUR AU` line on the first page (or from the columns header on
    the following pages) up to the page footer, and the parsing stops at the
    `TOTAL DES OPERATIONS` line.

    Parameters:
        filePath: path of the PDF, with `/` as separator. The file name has to
            start with the year and month as `YYYYMM`.

    Returns:
        A tuple `(initialAmount, totalMonthlyDebit, totalMonthlyCredit,
        transactions, fileDatetime)` where:
        - `initialAmount` is the balance of the `SOLDE CREDITEUR AU` line, or
          `None` if that line was not found;
        - `totalMonthlyDebit` and `totalMonthlyCredit` are the amounts of the
          `TOTAL DES OPERATIONS` line, or `None` if that line was not found;
        - `transactions` is a list of dictionaries with the keys `'date'`,
          `'valeur'`, `'amount'` (negative for a debit), `'current amount'`
          (running balance once the operation is applied) and `'comment'`;
        - `fileDatetime` is the first day of the month given by the file name.
    """
    file = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(file)
    lines = getTextFromPdf(filePath).splitlines()

    started = False
    firstPage = True
    initialAmount = None
    initialDate = None
    currentAmount = None
    # Kept to `None` when the statement has no totals line, so that the final
    # `return` does not fail on unbound names.
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    # Fields of the operation being read; `date` is `None` until a first one is found.
    date = None
    valeur = None
    amount = None
    comment = []
    transactions = []
    # Columns at which the `Débit` and `Crédit` headers end.
    debitIndex = None
    creditIndex = None

    for line in lines:
        if not started:
            # Outside of the operations table: only look for the line opening it.
            soldeCrediteurAuRegexMatch = SOLDE_CREDITEUR_AU_REGEX.fullmatch(line)
            isColumnsHeader = COLUMNS_HEADER.fullmatch(line) is not None
            if isColumnsHeader:
                debitIndex = line.index('Débit') + len('Débit')
                creditIndex = line.index('Crédit') + len('Crédit')
            if soldeCrediteurAuRegexMatch is not None:
                initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y')
                initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2))
                print(f'{initialAmount=}')
                currentAmount = initialAmount
                started = True
            elif isColumnsHeader and not firstPage:
                # After the first page the table starts right after the header.
                started = True
            continue

        # Page footer or header of the next page: leave the table until it is opened again.
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.fullmatch(line) is not None:
            firstPage = False
            started = False
            continue

        totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.fullmatch(line)
        totalDesOperationsCreditOnlyRegexMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.fullmatch(line)
        if totalDesOperationsRegexMatch is not None or totalDesOperationsCreditOnlyRegexMatch is not None:
            # Note that transfers between accounts are counted in both debits and credits, as trying to cancel them would make benefits show as a negative debit, which does not make sense.
            # January cannot just be considered as benefits only, as `20240122.pdf` also contains an additional transfer between own accounts.
            if totalDesOperationsRegexMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
            else:
                totalMonthlyCredit = toFloat(totalDesOperationsCreditOnlyRegexMatch.group(1))
                totalMonthlyDebit = 0
            print(f'{totalMonthlyDebit=}')
            print(f'{totalMonthlyCredit=}')
            # Nothing of interest after the totals.
            break

        firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.fullmatch(line)
        if firstLineOfPaymentRegexMatch is not None:
            # A new operation begins: the previous one, if any, is now complete.
            if date is not None:
                transactions.append(_buildTransaction(date, valeur, amount, currentAmount, comment, initialDate))
            date, firstCommentLine, valeur, amount = firstLineOfPaymentRegexMatch.groups()
            amount = toFloat(amount)
            # The line ends with the amount, so it is a debit when the line ends
            # closer to the end of the `Débit` header than to the `Crédit` one.
            lineLen = len(line)
            if abs(debitIndex - lineLen) < abs(creditIndex - lineLen):
                amount *= -1
            currentAmount += amount
            comment = [firstCommentLine]
        elif line != '':
            # Any other non-empty line continues the comment of the current operation.
            comment.append(line.strip())

    # The last operation is not followed by another one to trigger its storage.
    if date is not None:
        transactions.append(_buildTransaction(date, valeur, amount, currentAmount, comment, initialDate))

    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime
|
|
Loading…
Reference in New Issue
Block a user