From 412311802080ac3f9486619bb012486b83b2978e Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Thu, 3 Oct 2024 19:51:54 +0200
Subject: [PATCH] Add and use `readPdfBankStatement`

---
 bnp_pdf_statement_parser.py | 76 ++++++++-----------------------------
 utils.py                    | 74 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 62 deletions(-)

diff --git a/bnp_pdf_statement_parser.py b/bnp_pdf_statement_parser.py
index 56dd167..84fd36d 100755
--- a/bnp_pdf_statement_parser.py
+++ b/bnp_pdf_statement_parser.py
@@ -4,7 +4,7 @@ import os
 import matplotlib.pyplot as plt
 import matplotlib.ticker as ticker
 from datetime import datetime
-from utils import getTextFromPdf, getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, FIRST_LINE_OF_PAYMENT_REGEX, END_PAGE_AFTER_THE_FIRST_ONE_REGEX, SOLDE_CREDITEUR_AU_REGEX, TOTAL_DES_OPERATIONS_REGEX, TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX
+from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement
 
 PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'
 
@@ -24,7 +24,6 @@ for folder in os.listdir():
         print(file)
         fileDatetime = getDatetimeFromFileName(file)
         print(fileDatetime)
-        #otherBankAccountsCredits[fileDatetime] = otherBankAccountsCredits.get(fileDatetime, 0) +
 
 os.chdir(f'{MAIN_BANK_ACCOUNT}/')
 
@@ -39,66 +38,23 @@ for folder in sorted(os.listdir()):
     for file in sorted(os.listdir(folder)):
         filePath = f'{folder}/{file}'
         print(filePath)
-        fileDatetime = getDatetimeFromFileName(file)
+        initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime = readPdfBankStatement(filePath)
+        print('Initial amount', initialAmount)
+        print()
+        totals += [initialAmount]
+        print(f'Total monthly debit: {totalMonthlyDebit}')
+        print(f'Total monthly credit: {totalMonthlyCredit}')
+        totalMonthlyDebits += [totalMonthlyDebit]
+        totalMonthlyCredits += [totalMonthlyCredit]
+        totalMonthlyDifference = totalMonthlyCredit - totalMonthlyDebit
+        totalMonthlyDifferences += [totalMonthlyDifference]
+        if PRINT_TRANSACTIONS:
+            for transaction in transactions:
+                print(transaction['date'], transaction['valeur'], transaction['amount'], transaction['currentAmount'])
+                print(transaction['comment'])
+                print()
         if firstDatetime is None:
             firstDatetime = fileDatetime
-        content = getTextFromPdf(filePath)
-        lines = content.splitlines()
-        started = False
-        firstPage = True
-        initialAmount = None
-        currentAmount = None
-        date = None
-        comment = []
-        for line in lines:
-            if not started:
-                # We are interested in the content after this line:
-                if SOLDE_CREDITEUR_AU_REGEX.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
-                    if SOLDE_CREDITEUR_AU_REGEX.match(line):
-                        initialAmount = float(SOLDE_CREDITEUR_AU_REGEX.sub('', line).replace(',', '.').replace(' ', ''))
-                        currentAmount = initialAmount
-                        print('Initial amount', initialAmount)
-                        print()
-                        totals += [initialAmount]
-                    started = True
-                continue
-            else:
-                # We aren't interested in the content after this line:
-                if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None:
-                    firstPage = False
-                    started = False
-                    continue
-                # We aren't interested in the content after this line
-                elif line.startswith('TOTAL DES OPERATIONS'):
-                    totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
-                    # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
-                    # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
-                    toFloat = lambda group: float(group.replace(',', '.').replace(' ', ''))
-                    if totalDesOperationsRegexMatch is not None:
-                        totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
-                    else:
-                        totalMonthlyCredit = toFloat(TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line).group(1))
-                        totalMonthlyDebit = 0
-                    if os.getcwd().endswith(f'/{MAIN_BANK_ACCOUNT}'):
-                        totalMonthlyCredit += otherBankAccountsCredits[fileDatetime]
-                    print(f'Total monthly debit: {totalMonthlyDebit}')
-                    print(f'Total monthly credit: {totalMonthlyCredit}')
-                    totalMonthlyDebits += [totalMonthlyDebit]
-                    totalMonthlyCredits += [totalMonthlyCredit]
-                    totalMonthlyDifference = totalMonthlyCredit - totalMonthlyDebit
-                    totalMonthlyDifferences += [totalMonthlyDifference]
-                    break
-                if FIRST_LINE_OF_PAYMENT_REGEX.match(line) is not None:
-                    if date is not None and PRINT_TRANSACTIONS:
-                        print(date, valeur, amount, currentAmount)
-                        print('\n'.join(comment))
-                        print()
-                    date, valeur, amount = line.split()
-                    amount = float(amount.replace(',', '.'))
-                    currentAmount -= amount
-                    comment = []
-                else:
-                    comment += [line]
         #break
     #break
 lastDatetime = getDatetimeFromFileName(file)
diff --git a/utils.py b/utils.py
index 08537d3..686ca90 100644
--- a/utils.py
+++ b/utils.py
@@ -16,7 +16,6 @@ def getTextFromPdf(pdfPath):
 
 def getDatetimeFromFileName(aDatetimeStr):
     #aDatetime = datetime.strptime(aDatetimeStr, '%Y%m%d.pdf')
-    #aDatetime = datetime.strptime(aDatetimeStr[:6], '%Y%m')
     aDatetime = datetime(int(aDatetimeStr[:4]), int(aDatetimeStr[4:6]), 1)
     return aDatetime
 
@@ -24,4 +23,75 @@ def getMonthIndexSinceEpoch(aDatetime):
     return aDatetime.year * 12 + aDatetime.month
 
 def getMonthNameFromMonthIndex(monthIndex):
-    return datetime((monthIndex - 1) // 12, 1 + (monthIndex - 1) % 12, 1).strftime('%b %Y')
\ No newline at end of file
+    return datetime((monthIndex - 1) // 12, 1 + (monthIndex - 1) % 12, 1).strftime('%b %Y')
+
+def readPdfBankStatement(filePath):
+    """
+    Parse the PDF bank statement located at `filePath`.
+
+    Return the tuple `(initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime)`, `fileDatetime` being derived from the file name.
+    Each transaction is a dictionary with the keys `date`, `valeur`, `amount`, `currentAmount` (the running amount once `amount` has been subtracted) and `comment` (the lines following the payment line, joined with new lines).
+    Raise `ValueError` if the statement does not contain any `TOTAL DES OPERATIONS` line.
+    """
+    file = filePath.split('/')[-1]
+    fileDatetime = getDatetimeFromFileName(file)
+    content = getTextFromPdf(filePath)
+    lines = content.splitlines()
+    started = False
+    firstPage = True
+    initialAmount = None
+    currentAmount = None
+    totalMonthlyDebit = None
+    totalMonthlyCredit = None
+    comment = []
+    transactions = []
+    for line in lines:
+        if not started:
+            # We are interested in the content after this line:
+            if SOLDE_CREDITEUR_AU_REGEX.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
+                if SOLDE_CREDITEUR_AU_REGEX.match(line):
+                    initialAmount = float(SOLDE_CREDITEUR_AU_REGEX.sub('', line).replace(',', '.').replace(' ', ''))
+                    currentAmount = initialAmount
+                started = True
+            continue
+        else:
+            # We aren't interested in the content after this line:
+            if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None:
+                firstPage = False
+                started = False
+                continue
+            # We aren't interested in the content after this line
+            elif line.startswith('TOTAL DES OPERATIONS'):
+                totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
+                # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
+                # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
+                toFloat = lambda group: float(group.replace(',', '.').replace(' ', ''))
+                if totalDesOperationsRegexMatch is not None:
+                    totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
+                else:
+                    totalMonthlyCredit = toFloat(TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line).group(1))
+                    totalMonthlyDebit = 0
+                #if os.getcwd().endswith(f'/{MAIN_BANK_ACCOUNT}'):
+                #    totalMonthlyCredit += otherBankAccountsCredits[fileDatetime]
+
+                break
+            if FIRST_LINE_OF_PAYMENT_REGEX.match(line) is not None:
+                date, valeur, amount = line.split()
+                amount = float(amount.replace(',', '.'))
+                currentAmount -= amount
+                # Register the transaction as soon as it starts, so that the last one of the statement is not lost. `comment` is the very list stored in the transaction, hence the following lines are appended to it in place.
+                comment = []
+                transactions += [{
+                    'date': date,
+                    'valeur': valeur,
+                    'amount': amount,
+                    'currentAmount': currentAmount,
+                    'comment': comment
+                }]
+            else:
+                comment += [line]
+    if totalMonthlyCredit is None:
+        raise ValueError(f'No `TOTAL DES OPERATIONS` line found in {filePath}')
+    for transaction in transactions:
+        transaction['comment'] = '\n'.join(transaction['comment'])
+    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime
\ No newline at end of file