diff --git a/bnp_pdf_statement_parser.py b/bnp_pdf_statement_parser.py index d257fff..7646fd1 100755 --- a/bnp_pdf_statement_parser.py +++ b/bnp_pdf_statement_parser.py @@ -9,9 +9,9 @@ import matplotlib.pyplot as plt import matplotlib.ticker as ticker from datetime import datetime -path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/compte_de_cheques/' +PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/compte_de_cheques/' -os.chdir(path) +os.chdir(PATH) ''' Assuming file hierarchy like: @@ -30,10 +30,10 @@ def execute(command): def getTextFromPdf(pdfPath): return execute(['pdftotext', '-raw', pdfPath, '-']) -firstLineOfPaymentRegex = re.compile('\\d{2}\\.\\d{2} \\d{2}\\.\\d{2} \\d+,\\d{2}') -endPageAfterTheFirstOneRegex = re.compile('P\\. \\d+/\\d+') -soldeCrediteurAuRegex = re.compile('SOLDE CREDITEUR AU \\d{2}\\.\\d{2}\\.\\d{4}') -totalDesOperationsRegex = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([0-9 ]+,\\d{2})\\ ([0-9 ]+,\\d{2})') +FIRST_LINE_OF_PAYMENT_REGEX = re.compile('\\d{2}\\.\\d{2} \\d{2}\\.\\d{2} \\d+,\\d{2}') +END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile('P\\. \\d+/\\d+') +SOLDE_CREDITEUR_AU_REGEX = re.compile('SOLDE CREDITEUR AU \\d{2}\\.\\d{2}\\.\\d{4}') +TOTAL_DES_OPERATIONS_REGEX = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([0-9 ]+,\\d{2})\\ ([0-9 ]+,\\d{2})') PRINT_TRANSACTIONS = False @@ -61,9 +61,9 @@ for folder in sorted(os.listdir()): for line in lines: if not started: # We are interested in the content after this line: - if soldeCrediteurAuRegex.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage): - if soldeCrediteurAuRegex.match(line): - initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', '')) + if SOLDE_CREDITEUR_AU_REGEX.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage): + if SOLDE_CREDITEUR_AU_REGEX.match(line): + initialAmount = float(SOLDE_CREDITEUR_AU_REGEX.sub('', line).replace(',', '.').replace(' ', '')) currentAmount = initialAmount print('Initial amount', initialAmount) print() @@ -72,13 +72,13 @@ for folder in sorted(os.listdir()): continue else: # We aren't interested in the content after this line: - if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None: + if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None: firstPage = False started = False continue # We aren't interested in the content after this line elif line.startswith('TOTAL DES OPERATIONS'): - totalDesOperationsRegexMatch = totalDesOperationsRegex.match(line) + totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line) totalMonthlyDebit, totalMonthlyCredit = [float(group.replace(',', '.').replace(' ', '')) for group in totalDesOperationsRegexMatch.groups()] print(f'Total monthly debit: {totalMonthlyDebit}') print(f'Total monthly credit: {totalMonthlyCredit}') @@ -87,7 +87,7 @@ for folder in sorted(os.listdir()): totalMonthlyDifference = totalMonthlyCredit - totalMonthlyDebit totalMonthlyDifferences += [totalMonthlyDifference] break - if firstLineOfPaymentRegex.match(line) is not None: + if FIRST_LINE_OF_PAYMENT_REGEX.match(line) is not None: if date is not None and PRINT_TRANSACTIONS: print(date, valeur, amount, currentAmount) print('\n'.join(comment))