Add and use readPdfBankStatement

This commit is contained in:
Benjamin Loison 2024-10-03 19:51:54 +02:00
parent b99838696e
commit 4123118020
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 76 additions and 62 deletions

View File

@ -4,7 +4,7 @@ import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.ticker as ticker import matplotlib.ticker as ticker
from datetime import datetime from datetime import datetime
from utils import getTextFromPdf, getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, FIRST_LINE_OF_PAYMENT_REGEX, END_PAGE_AFTER_THE_FIRST_ONE_REGEX, SOLDE_CREDITEUR_AU_REGEX, TOTAL_DES_OPERATIONS_REGEX, TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement
PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/' PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'
@ -24,7 +24,6 @@ for folder in os.listdir():
print(file) print(file)
fileDatetime = getDatetimeFromFileName(file) fileDatetime = getDatetimeFromFileName(file)
print(fileDatetime) print(fileDatetime)
#otherBankAccountsCredits[fileDatetime] = otherBankAccountsCredits.get(fileDatetime, 0) + #otherBankAccountsCredits[fileDatetime] = otherBankAccountsCredits.get(fileDatetime, 0) +
os.chdir(f'{MAIN_BANK_ACCOUNT}/') os.chdir(f'{MAIN_BANK_ACCOUNT}/')
@ -39,66 +38,23 @@ for folder in sorted(os.listdir()):
for file in sorted(os.listdir(folder)): for file in sorted(os.listdir(folder)):
filePath = f'{folder}/{file}' filePath = f'{folder}/{file}'
print(filePath) print(filePath)
fileDatetime = getDatetimeFromFileName(file) initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime = readPdfBankStatement(filePath)
print('Initial amount', initialAmount)
print()
totals += [initialAmount]
print(f'Total monthly debit: {totalMonthlyDebit}')
print(f'Total monthly credit: {totalMonthlyCredit}')
totalMonthlyDebits += [totalMonthlyDebit]
totalMonthlyCredits += [totalMonthlyCredit]
totalMonthlyDifference = totalMonthlyCredit - totalMonthlyDebit
totalMonthlyDifferences += [totalMonthlyDifference]
if PRINT_TRANSACTIONS:
for transaction in transactions:
print(transaction['date'], transaction['valeur'], transaction['amount'], transaction['currentAmount'])
print('\n'.join(transaction['comment']))
print()
if firstDatetime is None: if firstDatetime is None:
firstDatetime = fileDatetime firstDatetime = fileDatetime
content = getTextFromPdf(filePath)
lines = content.splitlines()
started = False
firstPage = True
initialAmount = None
currentAmount = None
date = None
comment = []
for line in lines:
if not started:
# We are interested in the content after this line:
if SOLDE_CREDITEUR_AU_REGEX.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
if SOLDE_CREDITEUR_AU_REGEX.match(line):
initialAmount = float(SOLDE_CREDITEUR_AU_REGEX.sub('', line).replace(',', '.').replace(' ', ''))
currentAmount = initialAmount
print('Initial amount', initialAmount)
print()
totals += [initialAmount]
started = True
continue
else:
# We aren't interested in the content after this line:
if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None:
firstPage = False
started = False
continue
# We aren't interested in the content after this line
elif line.startswith('TOTAL DES OPERATIONS'):
totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
# Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
# Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
toFloat = lambda group: float(group.replace(',', '.').replace(' ', ''))
if totalDesOperationsRegexMatch is not None:
totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
else:
totalMonthlyCredit = toFloat(TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line).group(1))
totalMonthlyDebit = 0
if os.getcwd().endswith(f'/{MAIN_BANK_ACCOUNT}'):
totalMonthlyCredit += otherBankAccountsCredits[fileDatetime]
print(f'Total monthly debit: {totalMonthlyDebit}')
print(f'Total monthly credit: {totalMonthlyCredit}')
totalMonthlyDebits += [totalMonthlyDebit]
totalMonthlyCredits += [totalMonthlyCredit]
totalMonthlyDifference = totalMonthlyCredit - totalMonthlyDebit
totalMonthlyDifferences += [totalMonthlyDifference]
break
if FIRST_LINE_OF_PAYMENT_REGEX.match(line) is not None:
if date is not None and PRINT_TRANSACTIONS:
print(date, valeur, amount, currentAmount)
print('\n'.join(comment))
print()
date, valeur, amount = line.split()
amount = float(amount.replace(',', '.'))
currentAmount -= amount
comment = []
else:
comment += [line]
#break #break
#break #break
lastDatetime = getDatetimeFromFileName(file) lastDatetime = getDatetimeFromFileName(file)

View File

@ -16,7 +16,6 @@ def getTextFromPdf(pdfPath):
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of a file name.

    Only the first six characters of `aDatetimeStr` are read: characters 0-3
    as the year and characters 4-5 as the month (for instance `20240122.pdf`
    gives 1 January 2024). Everything after them, including the day and the
    extension, is ignored and the day is always set to 1.

    Raises `ValueError` when those characters are not digits or do not form
    a valid year/month.
    """
    year = int(aDatetimeStr[:4])
    month = int(aDatetimeStr[4:6])
    return datetime(year, month, 1)
@ -24,4 +23,63 @@ def getMonthIndexSinceEpoch(aDatetime):
return aDatetime.year * 12 + aDatetime.month return aDatetime.year * 12 + aDatetime.month
def getMonthNameFromMonthIndex(monthIndex):
    """Format a month index as abbreviated month name and year (`%b %Y`).

    `monthIndex` follows the `year * 12 + month` convention, with `month`
    in 1..12, so it is shifted by one before being split back into a year
    and a zero-based month offset.
    """
    year, monthOffset = divmod(monthIndex - 1, 12)
    return datetime(year, monthOffset + 1, 1).strftime('%b %Y')
def _parseFrenchAmount(text):
    # Convert an amount written with a decimal comma and space-separated
    # thousands (`1 234,56`) to a float.
    return float(text.replace(',', '.').replace(' ', ''))


def readPdfBankStatement(filePath):
    """Parse one PDF bank statement.

    The text returned by `getTextFromPdf(filePath)` is scanned line by line.
    Lines are only interpreted between a start marker (the opening balance
    line matched by `SOLDE_CREDITEUR_AU_REGEX`, or the table header on pages
    after the first one) and an end-of-page marker. Scanning stops at the
    `TOTAL DES OPERATIONS` line.

    Returns a tuple
    `(initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime)`:

    - `initialAmount`: opening balance as a float, or `None` if no opening
      balance line was found.
    - `totalMonthlyDebit`, `totalMonthlyCredit`: floats read from the totals
      line (`totalMonthlyDebit` is `0` when the line only carries a credit).
    - `transactions`: list of dicts with keys `date` and `valeur` (strings as
      found on the line), `amount` (float), `currentAmount` (running balance
      once `amount` has been subtracted) and `comment` (the lines following
      the transaction line, already joined with newlines into ONE string, so
      callers must not join it again).
    - `fileDatetime`: result of `getDatetimeFromFileName` on the file name.

    Raises `ValueError` when no usable `TOTAL DES OPERATIONS` line is found.
    """
    fileName = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(fileName)
    lines = getTextFromPdf(filePath).splitlines()

    started = False
    firstPage = True
    initialAmount = None
    currentAmount = None
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    # Transaction currently being assembled: its comment lines are only
    # complete once the next transaction (or the totals line) is reached.
    pendingTransaction = None
    comment = []
    transactions = []

    for line in lines:
        if not started:
            # We are interested in the content after this line:
            openingBalanceMatch = SOLDE_CREDITEUR_AU_REGEX.match(line)
            isFollowingPageHeader = not firstPage and line.startswith('Date Nature des opérations Valeur Débit Crédit')
            if openingBalanceMatch is not None:
                initialAmount = _parseFrenchAmount(SOLDE_CREDITEUR_AU_REGEX.sub('', line))
                currentAmount = initialAmount
            if openingBalanceMatch is not None or isFollowingPageHeader:
                started = True
            continue

        # We aren't interested in the content after this line (page footer):
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None:
            firstPage = False
            started = False
            continue

        # Nothing of interest follows the totals line.
        if line.startswith('TOTAL DES OPERATIONS'):
            # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
            # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
            totalsMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
            if totalsMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = [_parseFrenchAmount(group) for group in totalsMatch.groups()]
            else:
                creditOnlyMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line)
                if creditOnlyMatch is None:
                    raise ValueError(f'Unrecognized `TOTAL DES OPERATIONS` line in {filePath}: {line!r}')
                totalMonthlyCredit = _parseFrenchAmount(creditOnlyMatch.group(1))
                totalMonthlyDebit = 0
            # NOTE(review): adding the credits of the other bank accounts to the
            # main account total relies on script-level state, so it is not done
            # here - confirm that the caller applies it.
            break

        if FIRST_LINE_OF_PAYMENT_REGEX.match(line) is not None:
            # A new transaction starts: the previous one is now complete.
            if pendingTransaction is not None:
                pendingTransaction['comment'] = '\n'.join(comment)
                transactions.append(pendingTransaction)
            date, valeur, amount = line.split()
            amount = float(amount.replace(',', '.'))
            # NOTE(review): every amount is subtracted, whatever column it came
            # from - presumably only right for debits; verify for credits.
            currentAmount -= amount
            pendingTransaction = {
                'date': date,
                'valeur': valeur,
                'amount': amount,
                'currentAmount': currentAmount,
            }
            comment = []
        else:
            comment.append(line)

    # The last transaction has no following transaction line to trigger its
    # storage, so store it here.
    if pendingTransaction is not None:
        pendingTransaction['comment'] = '\n'.join(comment)
        transactions.append(pendingTransaction)

    if totalMonthlyDebit is None or totalMonthlyCredit is None:
        raise ValueError(f'No `TOTAL DES OPERATIONS` line found in {filePath}')

    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime