#!/usr/bin/python3
"""Print the transaction lines found in BNP Paribas PDF account statements.

Depends on `pdftotext`.

Assuming file hierarchy like:

2022
├── 20221121.pdf
└── 20221221.pdf
2023
├── 20230123.pdf
└── 20230221.pdf
"""

import os
import re
import shlex
import subprocess

# A line opening a payment: two `dd.dd` tokens followed by a number written
# with a comma and two decimals (presumably operation date, value date and
# amount -- confirm against real statements).
firstLineOfPaymentRegex = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
# Page marker such as `P. 2/3`, which closes every page after the first one.
# NOTE(review): the exact whitespace between `P.` and the page counter was
# ambiguous in the reviewed source; `\s+` accepts a single space as well as
# any other run of whitespace -- confirm against real `pdftotext` output.
endPageAfterTheFirstOneRegex = re.compile(r'P\.\s+\d+/\d+')

# Markers delimiting the parts of the extracted text that hold transactions.
SECTION_START_PREFIX = 'SOLDE CREDITEUR AU'
TABLE_HEADER_PREFIX = 'Date Nature des opérations Valeur Débit Crédit'
FIRST_PAGE_FOOTER_PREFIX = 'BNP PARIBAS SA au capital de'
STATEMENT_END_PREFIX = 'TOTAL DES OPERATIONS'


def execute(command):
    """Run `command` through the shell and return its stdout decoded as UTF-8.

    Raises `subprocess.CalledProcessError` when the command exits with a
    non-zero status.
    """
    return subprocess.check_output(command, shell=True).decode('utf-8')


def getTextFromPdf(pdfPath):
    """Return the text of the PDF at `pdfPath` as produced by `pdftotext -raw`."""
    # The path is quoted because `execute` goes through the shell.
    pdfPath = shlex.quote(pdfPath)
    return execute(f'pdftotext -raw {pdfPath} -')


def iterTransactionLines(lines):
    """Yield the lines of a statement that belong to its transaction sections.

    `lines` is an iterable of text lines (the extracted statement text, already
    split). A section opens after a line starting with `SOLDE CREDITEUR AU`
    or, once the first page is over, after the table header line. It closes on
    the first-page footer or on a `P. n/m` page marker. Scanning stops for
    good at the `TOTAL DES OPERATIONS` line. The marker lines themselves are
    never yielded.
    """
    started = False
    firstPage = True
    for line in lines:
        if not started:
            # We are interested in the content after this line. The table
            # header only counts after the first page.
            if line.startswith(SECTION_START_PREFIX) or (
                not firstPage and line.startswith(TABLE_HEADER_PREFIX)
            ):
                started = True
            continue
        # We aren't interested in the content after this line, until the next
        # section opens.
        if (
            line.startswith(FIRST_PAGE_FOOTER_PREFIX)
            or endPageAfterTheFirstOneRegex.match(line) is not None
        ):
            firstPage = False
            started = False
            continue
        # Nothing of interest follows the totals line.
        if line.startswith(STATEMENT_END_PREFIX):
            return
        yield line


def printTransactions(lines):
    """Print every transaction line of `lines`, with a blank line before each payment."""
    for line in iterTransactionLines(lines):
        if firstLineOfPaymentRegex.match(line) is not None:
            print()
        print(line)


def main():
    """Walk the statements directory and print the transactions of every PDF."""
    # Imported here rather than at module level so that the parsing helpers
    # above stay importable without the local `config` module.
    import config

    # The `f` prefix is required for `config.RLV_CHQ` to be substituted.
    path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'
    os.chdir(path)

    # Sorted so that statements are processed in a stable order.
    for folder in sorted(os.listdir()):
        if not os.path.isdir(folder):
            continue
        for fileName in sorted(os.listdir(folder)):
            if not fileName.lower().endswith('.pdf'):
                continue
            print(folder, fileName)
            content = getTextFromPdf(f'{folder}/{fileName}')
            printTransactions(content.splitlines())


if __name__ == '__main__':
    main()

# TODO: check year
# TODO: debit/credit
# TODO: amount after transaction