BNP_PDF_statement_parser/bnp_pdf_statement_parser.py

#!/usr/bin/python3

# Depends on `pdftotext`.

import os, subprocess, shlex, re, config

# Root folder holding the statements. The folder suffix is read from
# `config.RLV_CHQ`; the `f` prefix is required, otherwise the placeholder is
# kept verbatim in the path instead of being interpolated.
path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'

# Every folder/file name used below is relative to this root.
os.chdir(path)

'''
Assuming file hierarchy like:

2022
├── 20221121.pdf
└── 20221221.pdf
2023
├── 20230123.pdf
└── 20230221.pdf
'''

def execute(command):
    '''
    Run `command` through the shell and return its standard output decoded as UTF-8.

    Raises `subprocess.CalledProcessError` if the command exits with a non-zero status.
    '''
    completed = subprocess.run(command, shell = True, stdout = subprocess.PIPE, check = True)
    return completed.stdout.decode('utf-8')

def getTextFromPdf(pdfPath):
    '''
    Return the text that `pdftotext -raw` extracts from the PDF at `pdfPath`.

    The path is shell-quoted before being inserted in the command line, and the
    trailing `-` makes `pdftotext` write the text to standard output.
    '''
    quotedPath = shlex.quote(pdfPath)
    command = f'pdftotext -raw {quotedPath} -'
    return execute(command)

# Raw strings are used so that `\d` and `\.` reach the regex engine untouched
# instead of being (invalid) string escape sequences.

# First line of a payment: two `dd.dd` groups (presumably the operation date and
# the value date) followed by an amount with a decimal comma, e.g.
# `21.11 21.11 12,34`.
# NOTE(review): an amount containing a space as thousands separator would not
# match `\d+,\d{2}` - confirm against real statements.
firstLineOfPaymentRegex = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
# Page marker such as `P. 2/3`; per its name, expected at the end of every page
# after the first one.
endPageAfterTheFirstOneRegex = re.compile(r'P\. \d+/\d+')

def _iterOperationLines(lines):
    '''
    Yield the lines of `lines` that are located inside the operations table.

    Collection starts after a line beginning with `SOLDE CREDITEUR AU` or, once
    a page end has been seen, after the table header line. It pauses at each
    page end (legal footer or `P. x/y` marker) and stops for good at the line
    beginning with `TOTAL DES OPERATIONS`. The delimiter lines themselves are
    never yielded.
    '''
    started = False
    firstPage = True
    for line in lines:
        if not started:
            # We are interested in the content after this line:
            if line.startswith('SOLDE CREDITEUR AU') or (not firstPage and line.startswith('Date Nature des opérations Valeur Débit Crédit')):
                started = True
            continue
        # We aren't interested in the content after this line (end of a page):
        if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
            firstPage = False
            started = False
            continue
        # We aren't interested in anything after the totals line.
        if line.startswith('TOTAL DES OPERATIONS'):
            return
        yield line

# Walk every `<folder>/<statement>.pdf` below the current directory instead of a
# single hard-coded statement. Entries are visited in name order (chronological
# for `YYYY/YYYYMMDD.pdf` names); entries that aren't folders or PDFs are
# skipped so that a stray file doesn't make `os.listdir` or `pdftotext` fail.
for folder in sorted(os.listdir()):
    if not os.path.isdir(folder):
        continue
    for file in sorted(os.listdir(folder)):
        if not file.lower().endswith('.pdf'):
            continue
        print(folder, file)
        filePath = f'{folder}/{file}'
        content = getTextFromPdf(filePath)
        for line in _iterOperationLines(content.splitlines()):
            # An empty line separates each payment from the previous one.
            if firstLineOfPaymentRegex.match(line) is not None:
                print()
            print(line)

# TODO: check year
# TODO: debit/credit
# TODO: amount after transaction