From 2d4a10da0527207b73be3725cc615aced23fde11 Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Fri, 4 Oct 2024 00:53:33 +0200 Subject: [PATCH] #2: Use `pdftotext -layout` instead of `pdftotext -raw` to distinguish debit from credit --- bnp_pdf_statement_parser.py | 7 ++++--- utils.py | 41 ++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/bnp_pdf_statement_parser.py b/bnp_pdf_statement_parser.py index fb11ef6..b61aadd 100755 --- a/bnp_pdf_statement_parser.py +++ b/bnp_pdf_statement_parser.py @@ -6,6 +6,7 @@ import matplotlib.ticker as ticker from datetime import datetime from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement import operator +from pprint import pprint PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/' @@ -34,11 +35,12 @@ for folder in os.listdir(): filePath = f'{folder}/{file}' print(filePath) print(readPdfBankStatement(filePath)) - exit(0) + #exit(0) transactions = readPdfBankStatement(filePath)[3] + pprint(transactions) appendTransactions(transactions, folder) -#exit(1) +exit(1) os.chdir(f'{MAIN_BANK_ACCOUNT}/') for folder in sorted(os.listdir()): @@ -53,7 +55,6 @@ for folder in sorted(os.listdir()): #break #break -from pprint import pprint allTransactions.sort(key = operator.itemgetter('date')) print(len(allTransactions)) pprint(allTransactions) diff --git a/utils.py b/utils.py index c8adc21..d793850 100644 --- a/utils.py +++ b/utils.py @@ -2,17 +2,18 @@ import subprocess from datetime import datetime import re -FIRST_LINE_OF_PAYMENT_REGEX = re.compile('(\\d{2}\\.\\d{2}) (\\d{2}\\.\\d{2}) ([\\d ]+,\\d{2})') +# Source: [the Stack Overflow answer 766377](https://stackoverflow.com/a/766377) +FIRST_LINE_OF_PAYMENT_REGEX = re.compile('\\ +(\\d{2}\\.\\d{2})\\ +([A-Z\\d /.()*]+?)\\ +(\\d{2}\\.\\d{2})\\ +([\\d ]+,\\d{2})') END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile('P\\. \\d+/\\d+') -SOLDE_CREDITEUR_AU_REGEX = re.compile('SOLDE CREDITEUR AU (\\d{2}\\.\\d{2}\\.\\d{4}) ([\\d ]+,\\d{2})') -TOTAL_DES_OPERATIONS_REGEX = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([\\d ]+,\\d{2})\\ ([\\d ]+,\\d{2})') -TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([\\d ]+,\\d{2})') +SOLDE_CREDITEUR_AU_REGEX = re.compile('\\ +SOLDE CREDITEUR AU (\\d{2}\\.\\d{2}\\.\\d{4})\\ +([\\d ]+,\\d{2})') +TOTAL_DES_OPERATIONS_REGEX = re.compile('\\ +TOTAL\\ DES\\ OPERATIONS\\ +([\\d ]+,\\d{2})\\ +([\\d ]+,\\d{2})') +TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile('\\ +TOTAL\\ DES\\ OPERATIONS\\ +([\\d ]+,\\d{2})') def execute(command): return subprocess.check_output(command).decode('utf-8') def getTextFromPdf(pdfPath): - return execute(['pdftotext', '-raw', pdfPath, '-']) + return execute(['pdftotext', '-layout', pdfPath, '-']) def getDatetimeFromFileName(aDatetimeStr): #aDatetime = datetime.strptime(aDatetimeStr, '%Y%m%d.pdf') @@ -56,6 +57,7 @@ def readPdfBankStatement(filePath): if soldeCrediteurAuRegexMatch is not None: initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y') initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2)) + print(f'{initialAmount=}') #currentAmount = initialAmount started = True continue @@ -66,16 +68,20 @@ def readPdfBankStatement(filePath): started = False continue # We aren't interested in the content after this line - elif line.startswith('TOTAL DES OPERATIONS'): + else: totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line) - # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense. - # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts. - if totalDesOperationsRegexMatch is not None: - totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()] - else: - totalMonthlyCredit = toFloat(TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line).group(1)) - totalMonthlyDebit = 0 - break + totalDesOperationsCreditOnlyRegexMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line) + if totalDesOperationsRegexMatch is not None or totalDesOperationsCreditOnlyRegexMatch is not None: + # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense. + # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts. + if totalDesOperationsRegexMatch is not None: + totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()] + else: + totalMonthlyCredit = toFloat(totalDesOperationsCreditOnlyRegexMatch.group(1)) + totalMonthlyDebit = 0 + print(f'{totalMonthlyDebit=}') + print(f'{totalMonthlyCredit=}') + break firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.match(line) if firstLineOfPaymentRegexMatch is not None: print(line) @@ -88,11 +94,12 @@ def readPdfBankStatement(filePath): 'comment': '\n'.join(comment) }] date = None - date, valeur, amount = firstLineOfPaymentRegexMatch.groups() + date, firstCommentLine, valeur, amount = firstLineOfPaymentRegexMatch.groups() amount = toFloat(amount) #currentAmount -= amount - comment = [] - else: + comment = [firstCommentLine] + elif line != '': + print(f'comment: {line}') comment += [line] if date is not None: transactions += [{