BNP_PDF_statement_parser/utils.py

import subprocess
from datetime import datetime
import re

# All patterns below are applied with `.match()`, i.e. anchored at the start of a line.

# First line of a transaction: `DD.MM DD.MM <amount>` (amount uses a decimal comma).
FIRST_LINE_OF_PAYMENT_REGEX = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
# Page marker such as `P. 2/3`.
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(r'P\. \d+/\d+')
# Balance line prefix `SOLDE CREDITEUR AU DD.MM.YYYY`; the amount follows it on the same line.
SOLDE_CREDITEUR_AU_REGEX = re.compile(r'SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')
# Totals line carrying two amounts (digits may be grouped with spaces).
TOTAL_DES_OPERATIONS_REGEX = re.compile(r'TOTAL\ DES\ OPERATIONS\ ([0-9 ]+,\d{2})\ ([0-9 ]+,\d{2})')
# Totals line carrying a single amount.
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile(r'TOTAL\ DES\ OPERATIONS\ ([0-9 ]+,\d{2})')

def execute(command):
    """Run `command` and return its standard output decoded as UTF-8.

    `command` is handed unchanged to `subprocess.check_output`, which raises
    `subprocess.CalledProcessError` when the process exits with a non-zero status.
    """
    rawOutput = subprocess.check_output(command)
    return rawOutput.decode('utf-8')

def getTextFromPdf(pdfPath):
    """Return the text of the PDF at `pdfPath` as produced by `pdftotext -raw`.

    The trailing `-` argument makes `pdftotext` write to standard output, which
    `execute` captures and decodes.
    """
    pdfToTextCommand = ['pdftotext', '-raw', pdfPath, '-']
    return execute(pdfToTextCommand)

def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of `aDatetimeStr`.

    Only the first six characters are read (`YYYYMM`); whatever follows them,
    such as a day number or a `.pdf` extension, is ignored and the day is
    always set to 1.
    """
    year = int(aDatetimeStr[:4])
    month = int(aDatetimeStr[4:6])
    return datetime(year, month, 1)

def getMonthIndexSinceEpoch(aDatetime):
    """Return a linear month number for `aDatetime`: `year * 12 + month`.

    Consecutive calendar months map to consecutive integers, which makes the
    value usable as a sortable per-month key.
    """
    monthsInFullYears = 12 * aDatetime.year
    return monthsInFullYears + aDatetime.month

def getMonthNameFromMonthIndex(monthIndex):
    """Format a `year * 12 + month` index as abbreviated month name and year (`'%b %Y'`)."""
    # Shift to a zero-based month first so that December (month 12) stays in its own year.
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    firstDayOfMonth = datetime(year, zeroBasedMonth + 1, 1)
    return firstDayOfMonth.strftime('%b %Y')

def readPdfBankStatement(filePath):
    """Parse a PDF bank statement into its opening balance, totals and transactions.

    The PDF is converted to text with `getTextFromPdf` and scanned line by line.
    Parsing is switched on by a `SOLDE CREDITEUR AU ...` line (or, on pages after
    the first, by the table header line), switched off at each page footer, and
    stopped for good at the `TOTAL DES OPERATIONS` line.

    Parameters:
        filePath: path of the PDF; its last `/`-separated component must start
            with `YYYYMM` (see `getDatetimeFromFileName`).

    Returns:
        A tuple `(initialAmount, totalMonthlyDebit, totalMonthlyCredit,
        transactions, fileDatetime)` where:
        - `initialAmount` is the amount read from the `SOLDE CREDITEUR AU` line,
          or `None` if no such line was found;
        - `totalMonthlyDebit` and `totalMonthlyCredit` come from the
          `TOTAL DES OPERATIONS` line, or are both `None` if that line was never
          reached (previously this case raised `UnboundLocalError`);
        - `transactions` is a list of dicts with keys `date`, `valeur`, `amount`,
          `currentAmount` (running balance after the transaction) and `comment`
          (the lines following the transaction line, joined with newlines);
        - `fileDatetime` is the value of `getDatetimeFromFileName` for the file.
    """
    def toFloat(text):
        # Amounts use a decimal comma and may group digits with spaces.
        return float(text.replace(',', '.').replace(' ', ''))

    file = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(file)
    content = getTextFromPdf(filePath)

    started = False
    firstPage = True
    initialAmount = None
    currentAmount = None
    # Stay `None` when the statement has no `TOTAL DES OPERATIONS` line, so the
    # final `return` never refers to an unbound local.
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    # Transaction whose comment lines are still being collected, if any.
    pendingTransaction = None
    commentLines = []
    transactions = []

    for line in content.splitlines():
        if not started:
            # We are interested in the content after this line:
            isBalanceLine = SOLDE_CREDITEUR_AU_REGEX.match(line) is not None
            isTableHeader = line.startswith('Date Nature des opérations Valeur Débit Crédit')
            if isBalanceLine or (isTableHeader and not firstPage):
                if isBalanceLine:
                    # Strip the `SOLDE CREDITEUR AU <date>` prefix; what is left is the amount.
                    initialAmount = toFloat(SOLDE_CREDITEUR_AU_REGEX.sub('', line))
                    currentAmount = initialAmount
                started = True
            continue

        # Page footer: we aren't interested in the content until the next table header.
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None:
            firstPage = False
            started = False
            continue

        # We aren't interested in the content after this line.
        if line.startswith('TOTAL DES OPERATIONS'):
            # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
            # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
            bothTotalsMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
            if bothTotalsMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in bothTotalsMatch.groups()]
            else:
                # A single amount on the line is treated as the credit total.
                totalMonthlyCredit = toFloat(TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line).group(1))
                totalMonthlyDebit = 0
            break

        if FIRST_LINE_OF_PAYMENT_REGEX.match(line) is not None:
            # A new transaction starts: the previous one has all its comment lines now.
            if pendingTransaction is not None:
                pendingTransaction['comment'] = '\n'.join(commentLines)
                transactions.append(pendingTransaction)
            date, valeur, amountText = line.split()
            amount = float(amountText.replace(',', '.'))
            # Every amount is subtracted from the running balance.
            # NOTE(review): debit and credit amounts are not told apart here - confirm this is intended.
            currentAmount -= amount
            pendingTransaction = {
                'date': date,
                'valeur': valeur,
                'amount': amount,
                'currentAmount': currentAmount,
            }
            commentLines = []
        else:
            commentLines.append(line)

    # Consider the last transaction, which no following payment line has flushed.
    if pendingTransaction is not None:
        pendingTransaction['comment'] = '\n'.join(commentLines)
        transactions.append(pendingTransaction)

    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime
Move stuff from `main.py` to `README.md` and `utils.py` 2024-10-03 18:07:03 +02:00			`import subprocess`
			`from datetime import datetime`
			`import re`

			`FIRST_LINE_OF_PAYMENT_REGEX = re.compile('\\d{2}\\.\\d{2} \\d{2}\\.\\d{2} \\d+,\\d{2}')`
			`END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile('P\\. \\d+/\\d+')`
			`SOLDE_CREDITEUR_AU_REGEX = re.compile('SOLDE CREDITEUR AU \\d{2}\\.\\d{2}\\.\\d{4}')`
			`TOTAL_DES_OPERATIONS_REGEX = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([0-9 ]+,\\d{2})\\ ([0-9 ]+,\\d{2})')`
WIP adding other bank accounts 2024-10-03 19:33:54 +02:00			`TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([0-9 ]+,\\d{2})')`
Move stuff from `main.py` to `README.md` and `utils.py` 2024-10-03 18:07:03 +02:00
			`def execute(command):`
			`return subprocess.check_output(command).decode('utf-8')`

			`def getTextFromPdf(pdfPath):`
			`return execute(['pdftotext', '-raw', pdfPath, '-'])`

			`def getDatetimeFromFileName(aDatetimeStr):`
WIP adding other bank accounts 2024-10-03 19:33:54 +02:00			`#aDatetime = datetime.strptime(aDatetimeStr, '%Y%m%d.pdf')`
			`aDatetime = datetime(int(aDatetimeStr[:4]), int(aDatetimeStr[4:6]), 1)`
			`return aDatetime`
Move stuff from `main.py` to `README.md` and `utils.py` 2024-10-03 18:07:03 +02:00
			`def getMonthIndexSinceEpoch(aDatetime):`
			`return aDatetime.year * 12 + aDatetime.month`

			`def getMonthNameFromMonthIndex(monthIndex):`
Add and use `readPdfBankStatement` 2024-10-03 19:51:54 +02:00			`return datetime((monthIndex - 1) // 12, 1 + (monthIndex - 1) % 12, 1).strftime('%b %Y')`

			`def readPdfBankStatement(filePath):`
			`file = filePath.split('/')[-1]`
			`fileDatetime = getDatetimeFromFileName(file)`
			`content = getTextFromPdf(filePath)`
			`lines = content.splitlines()`
			`started = False`
			`firstPage = True`
			`initialAmount = None`
			`currentAmount = None`
			`date = None`
			`comment = []`
			`transactions = []`
			`for line in lines:`
			`if not started:`
			`# We are interested in the content after this line:`
			`if SOLDE_CREDITEUR_AU_REGEX.match(line) is not None or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):`
			`if SOLDE_CREDITEUR_AU_REGEX.match(line):`
			`initialAmount = float(SOLDE_CREDITEUR_AU_REGEX.sub('', line).replace(',', '.').replace(' ', ''))`
			`currentAmount = initialAmount`
			`started = True`
			`continue`
			`else:`
			`# We aren't interested in the content after this line:`
			`if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None:`
			`firstPage = False`
			`started = False`
			`continue`
			`# We aren't interested in the content after this line`
			`elif line.startswith('TOTAL DES OPERATIONS'):`
			`totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)`
			`# Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.`
			# Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
			`toFloat = lambda group: float(group.replace(',', '.').replace(' ', ''))`
			`if totalDesOperationsRegexMatch is not None:`
			`totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]`
			`else:`
			`totalMonthlyCredit = toFloat(TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line).group(1))`
			`totalMonthlyDebit = 0`
			`break`
			`if FIRST_LINE_OF_PAYMENT_REGEX.match(line) is not None:`
Consider last transaction 2024-10-03 20:10:06 +02:00			`#print(line)`
Add and use `readPdfBankStatement` 2024-10-03 19:51:54 +02:00			`if date is not None:`
			`transactions += [{`
			`'date': date,`
			`'valeur': valeur,`
			`'amount': amount,`
			`'currentAmount': currentAmount,`
			`'comment': '\n'.join(comment)`
			`}]`
Consider last transaction 2024-10-03 20:10:06 +02:00			`date = None`
Add and use `readPdfBankStatement` 2024-10-03 19:51:54 +02:00			`date, valeur, amount = line.split()`
			`amount = float(amount.replace(',', '.'))`
			`currentAmount -= amount`
			`comment = []`
			`else:`
			`comment += [line]`
Consider last transaction 2024-10-03 20:10:06 +02:00			`if date is not None:`
			`transactions += [{`
			`'date': date,`
			`'valeur': valeur,`
			`'amount': amount,`
			`'currentAmount': currentAmount,`
			`'comment': '\n'.join(comment)`
			`}]`
Add and use `readPdfBankStatement` 2024-10-03 19:51:54 +02:00			`return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime`