#!/usr/bin/python3

# Depends on `pdftotext`.

import os
import re
import subprocess

import config

path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'

os.chdir(path)

'''
Assuming file hierarchy like:

2022
├── 20221121.pdf
└── 20221221.pdf
2023
├── 20230123.pdf
└── 20230221.pdf
'''


def execute(command):
    """Run `command` and return its standard output decoded as UTF-8.

    `command` is an argument list (program name followed by its arguments);
    it is executed directly, without going through a shell.

    Raises subprocess.CalledProcessError if the command exits with a
    non-zero status.
    """
    return subprocess.check_output(command).decode('utf-8')


def getTextFromPdf(pdfPath):
    """Return the text that `pdftotext -raw` extracts from the PDF at `pdfPath`.

    The trailing '-' argument asks `pdftotext` to write to standard output
    instead of a file, so the text can be captured by `execute`.
    """
    return execute(['pdftotext', '-raw', pdfPath, '-'])


def _printPayment(date, valeur, amount, balance, comment):
    """Print one payment: its header line, its comment lines, then a blank line."""
    print(date, valeur, amount, balance)
    print('\n'.join(comment))
    print()


# First line of a payment: "<DD.MM> <DD.MM> <amount with a decimal comma>".
firstLineOfPaymentRegex = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
# Page footer of the form "P. <n>/<m>".
endPageAfterTheFirstOneRegex = re.compile(r'P\. \d+/\d+')
# Opening balance line; the balance is whatever follows the matched prefix.
soldeCrediteurAuRegex = re.compile(r'SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')

for folder in os.listdir():
    for file in os.listdir(folder):
        filePath = f'{folder}/{file}'
        print(filePath)
        content = getTextFromPdf(filePath)
        lines = content.splitlines()

        # `started` is True while we are inside the list of operations.
        started = False
        firstPage = True
        initialAmount = None
        currentAmount = None
        # Payment currently being accumulated (`date` is None until the first one).
        date = None
        valeur = None
        amount = None
        comment = []

        for line in lines:
            if not started:
                # We are interested in the content after this line:
                soldeMatch = soldeCrediteurAuRegex.match(line)
                isTableHeader = line.startswith('Date Nature des opérations Valeur Débit Crédit')
                if soldeMatch is not None or (isTableHeader and not firstPage):
                    if soldeMatch is not None:
                        # Drop the matched prefix, then turn "1 234,56" into 1234.56.
                        initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
                        currentAmount = initialAmount
                        print('Initial amount', initialAmount)
                        print()
                    started = True
                continue

            # We aren't interested in the content after this line (page footer):
            if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
                firstPage = False
                started = False
                continue
            # We aren't interested in anything after this line at all:
            if line.startswith('TOTAL DES OPERATIONS'):
                break

            if firstLineOfPaymentRegex.match(line) is not None:
                # A new payment begins: emit the one accumulated so far.
                if date is not None:
                    _printPayment(date, valeur, amount, currentAmount, comment)
                date, valeur, amount = line.split()
                amount = float(amount.replace(',', '.'))
                # NOTE(review): every amount is subtracted; credits do not seem
                # to be distinguished from debits here -- confirm.
                currentAmount -= amount
                comment = []
            else:
                comment += [line]

        # Emit the last payment: no further payment line follows it, so the
        # loop above never gets the chance to print it.
        if date is not None:
            _printPayment(date, valeur, amount, currentAmount, comment)

        # NOTE(review): only the first file of the first folder is processed;
        # this `break` and the one below look like debugging leftovers --
        # confirm before removing.
        break
    break