BNP_PDF_statement_parser/bnp_pdf_statement_parser.py

#!/usr/bin/python3

# Depends on `pdftotext`.

import os, subprocess, shlex, re, config

# Root folder holding the statements; the suffix of the folder name is read
# from the local `config` module (`config.RLV_CHQ`).
path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'

# Every relative path used below (`os.listdir()`, `<folder>/<file>`) is resolved
# against this folder.
os.chdir(path)

'''
Assuming file hierarchy like:

2022
├── 20221121.pdf
└── 20221221.pdf
2023
├── 20230123.pdf
└── 20230221.pdf
'''

def execute(command):
    """Run `command` through the shell and return what it wrote to stdout.

    The output is decoded as UTF-8. A non-zero exit status raises
    `subprocess.CalledProcessError`.
    """
    completedProcess = subprocess.run(
        command,
        shell = True,
        stdout = subprocess.PIPE,
        check = True,
    )
    return completedProcess.stdout.decode('utf-8')

def getTextFromPdf(pdfPath):
    """Return the text that `pdftotext -raw` extracts from the PDF at `pdfPath`.

    The path is shell-quoted because `execute` runs the command through a
    shell. The trailing `-` is passed to `pdftotext` as the output file
    argument, and whatever the command prints on stdout is returned.
    """
    quotedPath = shlex.quote(pdfPath)
    command = f'pdftotext -raw {quotedPath} -'
    return execute(command)

# Patterns used to recognise the structural lines of a statement. They are raw
# strings so that `\d` and `\.` reach the regex engine untouched instead of
# being treated as (invalid) string escape sequences.

# First line of a transaction, three space-separated fields: `DD.MM DD.MM <digits>,<2 digits>`.
firstLineOfPaymentRegex = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
# Page marker of the form `P. <n>/<m>`; used as an end-of-table marker alongside the legal footer line.
endPageAfterTheFirstOneRegex = re.compile(r'P\. \d+/\d+')
# Line starting with `SOLDE CREDITEUR AU DD.MM.YYYY`; the rest of the line is parsed as the initial amount.
soldeCrediteurAuRegex = re.compile(r'SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')

def _printTransaction(date, valeur, amount, currentAmount, comment):
    # Print one transaction (its three fields and the balance after it), then its comment lines.
    print(date, valeur, amount, currentAmount)
    print("\n".join(comment))
    print()

# Walk the `<folder>/<file>` hierarchy below the current working directory,
# extract the text of each statement and print every transaction found in it
# together with the running account amount.
for folder in os.listdir():
    for file in os.listdir(folder):
        #folder = '2022'
        #file = '20220321.pdf'
        filePath = f'{folder}/{file}'
        print(filePath)
        content = getTextFromPdf(filePath)
        lines = content.splitlines()
        # Whether the current line lies inside a transactions table.
        started = False
        # Until the first end-of-table marker is seen, the table only starts at the
        # `SOLDE CREDITEUR AU` line; afterwards it also starts at the column header line.
        firstPage = True
        initialAmount = None
        currentAmount = None
        # Fields of the transaction being accumulated; `date` stays `None`
        # until the first transaction line has been met.
        date = None
        valeur = None
        amount = None
        comment = []
        for line in lines:
            if not started:
                # We are interested in the content after this line:
                isInitialAmountLine = soldeCrediteurAuRegex.match(line) is not None
                isTableHeaderLine = line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage
                if isInitialAmountLine or isTableHeaderLine:
                    if isInitialAmountLine:
                        # What follows the date is the amount, e.g. ` 1 234,56`.
                        initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
                        currentAmount = initialAmount
                        print('Initial amount', initialAmount)
                        print()
                    started = True
                continue
            # We aren't interested in the content after this line (table ends, more may follow):
            if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
                firstPage = False
                started = False
                continue
            # We aren't interested in the content after this line (end of the transactions):
            if line.startswith('TOTAL DES OPERATIONS'):
                break
            if firstLineOfPaymentRegex.match(line) is not None:
                # A new transaction starts: the previous one is now complete.
                if date is not None:
                    _printTransaction(date, valeur, amount, currentAmount, comment)
                date, valeur, amount = line.split()
                amount = float(amount.replace(',', '.'))
                # NOTE(review): every amount is subtracted, i.e. treated as a debit
                # (see the `debit/credit` TODO of this file).
                currentAmount -= amount
                comment = []
            else:
                comment += [line]
        # The last transaction has no following transaction line to trigger its
        # output, so print it once all the lines have been consumed.
        if date is not None:
            _printTransaction(date, valeur, amount, currentAmount, comment)
        # NOTE(review): this `break` and the next one stop after the first file of
        # the first folder, presumably a development limit; remove both to process
        # every statement.
        break
    break

# TODO: check year
# TODO: debit/credit
# TODO: amount after transaction
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`#!/usr/bin/python3`

			# Depends on `pdftotext`.

			`import os, subprocess, shlex, re, config`

Correct a typo in the change folder `path` 2023-06-21 00:40:31 +02:00			`path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00
			`os.chdir(path)`

			`'''`
			`Assuming file hierarchy like:`

			`2022`
			`├── 20221121.pdf`
			`└── 20221221.pdf`
			`2023`
			`├── 20230123.pdf`
			`└── 20230221.pdf`
			`'''`

			`def execute(command):`
			`return subprocess.check_output(command, shell = True).decode('utf-8')`

			`def getTextFromPdf(pdfPath):`
			`pdfPath = shlex.quote(pdfPath)`
			`return execute(f'pdftotext -raw {pdfPath} -')`

			`firstLineOfPaymentRegex = re.compile('\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')`
			`endPageAfterTheFirstOneRegex = re.compile('P\. \d+/\d+')`
Add `initialAmount` parsing 2023-06-21 00:57:16 +02:00			`soldeCrediteurAuRegex = re.compile('SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00
			`for folder in os.listdir():`
			`for file in os.listdir(folder):`
Add `initialAmount` parsing 2023-06-21 00:57:16 +02:00			`#folder = '2022'`
			`#file = '20220321.pdf'`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`filePath = f'{folder}/{file}'`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`print(filePath)`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`content = getTextFromPdf(filePath)`
			`lines = content.splitlines()`
			`started = False`
			`firstPage = True`
Add `initialAmount` parsing 2023-06-21 00:57:16 +02:00			`initialAmount = None`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`currentAmount = None`
			`date = None`
			`comment = []`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`for line in lines:`
			`if not started:`
			`# We are interested in the content after this line:`
Add `initialAmount` parsing 2023-06-21 00:57:16 +02:00			`if soldeCrediteurAuRegex.match(line) != None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):`
			`if soldeCrediteurAuRegex.match(line):`
			`initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`currentAmount = initialAmount`
			`print('Initial amount', initialAmount)`
			`print()`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`started = True`
			`continue`
			`else:`
			`# We aren't interested in the content after this line:`
			`if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) != None:`
			`firstPage = False`
			`started = False`
			`continue`
			`# We aren't interested in the content after this line`
			`elif line.startswith('TOTAL DES OPERATIONS'):`
			`break`
			`if firstLineOfPaymentRegex.match(line) != None:`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`if date != None:`
			`print(date, valeur, amount, currentAmount)`
			`print("\n".join(comment))`
			`print()`
			`date, valeur, amount = line.split()`
			`amount = float(amount.replace(',', '.'))`
			`currentAmount -= amount`
			`comment = []`
			`else:`
			`comment += [line]`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`break`
			`break`

			`# TODO: check year`
			`# TODO: debit/credit`
			`# TODO: amount after transaction`