BNP_PDF_statement_parser/bnp_pdf_statement_parser.py

#!/usr/bin/python3

# Depends on `pdftotext`.

import os
import subprocess
import re

# Root folder containing one sub-folder per year of bank statements.
# A plain string literal is enough here: there is nothing to interpolate.
path = '/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'

# Every relative path used below is resolved against this folder.
os.chdir(path)

'''
Assuming file hierarchy like:

2022
├── 20221121.pdf
└── 20221221.pdf
2023
├── 20230123.pdf
└── 20230221.pdf
'''

def execute(command):
    '''
    Run `command` and return what it wrote to its standard output, decoded as UTF-8.

    `command` is passed unchanged to `subprocess.check_output`, which raises
    `subprocess.CalledProcessError` when the command exits with a non-zero status.
    '''
    rawOutput = subprocess.check_output(command)
    return rawOutput.decode('utf-8')

def getTextFromPdf(pdfPath):
    '''
    Return the text that `pdftotext -raw` extracts from the PDF located at `pdfPath`.
    '''
    # The trailing `-` asks `pdftotext` to write the text to its standard output.
    pdfToTextCommand = ['pdftotext', '-raw', pdfPath, '-']
    return execute(pdfToTextCommand)

# Raw strings keep `\d` and `\.` as regex escapes instead of (invalid) string escapes.

# First line of a transaction: `DD.MM DD.MM <amount with a decimal comma>`.
firstLineOfPaymentRegex = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
# Page marker such as `P. 2/3`.
endPageAfterTheFirstOneRegex = re.compile(r'P\. \d+/\d+')
# Opening balance label followed by a `DD.MM.YYYY` date; the amount comes after it on the same line.
soldeCrediteurAuRegex = re.compile(r'SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')

# Print the transactions of a statement, each followed by the balance computed after it.
# NOTE: the two `break`s at the end stop the script after the first file of the first folder
# returned by `os.listdir`; remove them to process the whole hierarchy.
for folder in os.listdir():
    for file in os.listdir(folder):
        filePath = f'{folder}/{file}'
        print(filePath)
        content = getTextFromPdf(filePath)
        lines = content.splitlines()
        # `started` is `True` while the current line is inside a transactions listing.
        started = False
        firstPage = True
        initialAmount = None
        currentAmount = None
        # `date` stays `None` until a first transaction line has been read.
        date = None
        # Lines read since the last transaction line; printed together with that transaction.
        comment = []
        for line in lines:
            if not started:
                # We are interested in the content after this line:
                soldeMatch = soldeCrediteurAuRegex.match(line)
                if soldeMatch is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
                    if soldeMatch is not None:
                        # What follows the label is the amount, written with a decimal comma and possibly spaces.
                        initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))
                        currentAmount = initialAmount
                        print('Initial amount', initialAmount)
                        print()
                    started = True
                    continue
            else:
                # We aren't interested in the content after this line:
                if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
                    firstPage = False
                    started = False
                    continue
                # We aren't interested in the content after this line
                elif line.startswith('TOTAL DES OPERATIONS'):
                    break
                if firstLineOfPaymentRegex.match(line) is not None:
                    # A new transaction starts: the previous one is now complete, print it.
                    if date is not None:
                        print(date, valeur, amount, currentAmount)
                        print('\n'.join(comment))
                        print()
                    date, valeur, amount = line.split()
                    amount = float(amount.replace(',', '.'))
                    # NOTE(review): every amount is subtracted, so a credit would presumably be
                    # handled like a debit - confirm against a statement containing a credit.
                    currentAmount -= amount
                    comment = []
                else:
                    comment.append(line)
        # The loop above only prints a transaction when the next one starts, so the last
        # transaction of the statement is still pending here: print it too.
        if date is not None:
            print(date, valeur, amount, currentAmount)
            print('\n'.join(comment))
            print()
        break
    break
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`#!/usr/bin/python3`

			# Depends on `pdftotext`.

Remove `config` dependency 2023-11-05 21:10:45 +01:00			`import os`
			`import subprocess`
			`import re`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00
Remove `config` dependency 2023-11-05 21:10:45 +01:00			`path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00
			`os.chdir(path)`

			`'''`
			`Assuming file hierarchy like:`

			`2022`
			`├── 20221121.pdf`
			`└── 20221221.pdf`
			`2023`
			`├── 20230123.pdf`
			`└── 20230221.pdf`
			`'''`

			`def execute(command):`
Correct a typo 2023-07-28 18:15:03 +02:00			`return subprocess.check_output(command).decode('utf-8')`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00
			`def getTextFromPdf(pdfPath):`
Remove `config` dependency 2023-11-05 21:10:45 +01:00			`return execute(['pdftotext', '-raw', pdfPath, '-'])`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00
			`firstLineOfPaymentRegex = re.compile('\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')`
			`endPageAfterTheFirstOneRegex = re.compile('P\. \d+/\d+')`
Add `initialAmount` parsing 2023-06-21 00:57:16 +02:00			`soldeCrediteurAuRegex = re.compile('SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00
			`for folder in os.listdir():`
			`for file in os.listdir(folder):`
Add `initialAmount` parsing 2023-06-21 00:57:16 +02:00			`#folder = '2022'`
			`#file = '20220321.pdf'`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`filePath = f'{folder}/{file}'`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`print(filePath)`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`content = getTextFromPdf(filePath)`
			`lines = content.splitlines()`
			`started = False`
			`firstPage = True`
Add `initialAmount` parsing 2023-06-21 00:57:16 +02:00			`initialAmount = None`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`currentAmount = None`
			`date = None`
			`comment = []`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`for line in lines:`
			`if not started:`
			`# We are interested in the content after this line:`
Use `is None` instead of `== None` 2023-09-14 20:18:51 +02:00			`if soldeCrediteurAuRegex.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):`
Add `initialAmount` parsing 2023-06-21 00:57:16 +02:00			`if soldeCrediteurAuRegex.match(line):`
			`initialAmount = float(soldeCrediteurAuRegex.sub('', line).replace(',', '.').replace(' ', ''))`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`currentAmount = initialAmount`
			`print('Initial amount', initialAmount)`
			`print()`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`started = True`
			`continue`
			`else:`
			`# We aren't interested in the content after this line:`
Use `is None` instead of `== None` 2023-09-14 20:18:51 +02:00			`if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`firstPage = False`
			`started = False`
			`continue`
			`# We aren't interested in the content after this line`
			`elif line.startswith('TOTAL DES OPERATIONS'):`
			`break`
Use `is None` instead of `== None` 2023-09-14 20:18:51 +02:00			`if firstLineOfPaymentRegex.match(line) is not None:`
			`if date is not None:`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`print(date, valeur, amount, currentAmount)`
Replace `"\n"` with `n` in Python scripts 2023-07-28 14:54:53 +02:00			`print('\n'.join(comment))`
Add account amount after each transaction 2023-06-21 01:05:48 +02:00			`print()`
			`date, valeur, amount = line.split()`
			`amount = float(amount.replace(',', '.'))`
			`currentAmount -= amount`
			`comment = []`
			`else:`
			`comment += [line]`
Add `bnp_pdf_statement_parser.py` 2023-06-21 00:36:27 +02:00			`break`
			`break`