#!/usr/bin/python3

# Depends on `pdftotext`.

import os
import re
import shlex
import subprocess

import config

path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'

os.chdir(path)

'''
Assuming file hierarchy like:

2022
├── 20221121.pdf
└── 20221221.pdf
2023
├── 20230123.pdf
└── 20230221.pdf
'''


def execute(command):
    """Run `command` through the shell and return its standard output.

    The output bytes are decoded as UTF-8. `subprocess.check_output` raises
    `subprocess.CalledProcessError` when the command exits with a non-zero
    status.
    """
    return subprocess.check_output(command, shell = True).decode('utf-8')


def getTextFromPdf(pdfPath):
    """Return the text that `pdftotext -raw` extracts from `pdfPath`.

    The path is shell-quoted before being interpolated, because `execute`
    hands the whole command line to the shell.
    """
    pdfPath = shlex.quote(pdfPath)
    return execute(f'pdftotext -raw {pdfPath} -')


# Raw strings: `\d` and `\.` are regex escapes, not string escapes, and
# non-raw literals containing them trigger an invalid-escape warning.
firstLineOfPaymentRegex = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
endPageAfterTheFirstOneRegex = re.compile(r'P\. \d+/\d+')
soldeCrediteurAuRegex = re.compile(r'SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')

for folder in os.listdir():
    for file in os.listdir(folder):
        #folder = '2022'
        #file = '20220321.pdf'
        print(folder, file)
        filePath = f'{folder}/{file}'

        content = getTextFromPdf(filePath)
        lines = content.splitlines()
        # `started` is True while we are inside a run of lines to print.
        started = False
        # Stays True until the first page footer has been seen.
        firstPage = True
        # Not used yet (see the TODOs at the end of the file).
        payment = []
        initialAmount = None
        for line in lines:
            if not started:
                # We are interested in the content after this line:
                openingBalanceMatch = soldeCrediteurAuRegex.match(line)
                isLaterPageHeader = (
                    line.startswith('Date Nature des opérations Valeur Débit Crédit')
                    and not firstPage
                )
                if openingBalanceMatch is not None or isLaterPageHeader:
                    if openingBalanceMatch is not None:
                        # What remains after removing the matched label is
                        # parsed as a number: ',' becomes the decimal point
                        # and spaces are dropped.
                        initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
                        print(initialAmount)
                    started = True
                # The marker line itself is never printed.
                continue

            # We aren't interested in the content after this line:
            if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
                firstPage = False
                started = False
                continue
            # We aren't interested in the content after this line
            if line.startswith('TOTAL DES OPERATIONS'):
                break

            # Blank separator before each line that opens a new payment.
            if firstLineOfPaymentRegex.match(line) is not None:
                print()
            print(line)
        # NOTE(review): these two `break`s stop after the first file of the
        # first folder, which looks like a debugging limit — confirm the
        # intended nesting before removing them.
        break
    break

# TODO: check year
# TODO: debit/credit
# TODO: amount after transaction