#2: Use pdftotext -layout instead of pdftotext -raw to distinguish debit from credit

This commit is contained in:
Benjamin Loison 2024-10-04 00:53:33 +02:00
parent 69d6442966
commit 2d4a10da05
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 28 additions and 20 deletions

View File

@@ -6,6 +6,7 @@ import matplotlib.ticker as ticker
from datetime import datetime from datetime import datetime
from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement from utils import getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, readPdfBankStatement
import operator import operator
from pprint import pprint
PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/' PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/'
@@ -34,11 +35,12 @@ for folder in os.listdir():
filePath = f'{folder}/{file}' filePath = f'{folder}/{file}'
print(filePath) print(filePath)
print(readPdfBankStatement(filePath)) print(readPdfBankStatement(filePath))
exit(0) #exit(0)
transactions = readPdfBankStatement(filePath)[3] transactions = readPdfBankStatement(filePath)[3]
pprint(transactions)
appendTransactions(transactions, folder) appendTransactions(transactions, folder)
#exit(1) exit(1)
os.chdir(f'{MAIN_BANK_ACCOUNT}/') os.chdir(f'{MAIN_BANK_ACCOUNT}/')
for folder in sorted(os.listdir()): for folder in sorted(os.listdir()):
@@ -53,7 +55,6 @@ for folder in sorted(os.listdir()):
#break #break
#break #break
from pprint import pprint
allTransactions.sort(key = operator.itemgetter('date')) allTransactions.sort(key = operator.itemgetter('date'))
print(len(allTransactions)) print(len(allTransactions))
pprint(allTransactions) pprint(allTransactions)

View File

@@ -2,17 +2,18 @@ import subprocess
from datetime import datetime from datetime import datetime
import re import re
FIRST_LINE_OF_PAYMENT_REGEX = re.compile('(\\d{2}\\.\\d{2}) (\\d{2}\\.\\d{2}) ([\\d ]+,\\d{2})') # Source: [the Stack Overflow answer 766377](https://stackoverflow.com/a/766377)
FIRST_LINE_OF_PAYMENT_REGEX = re.compile('\\ +(\\d{2}\\.\\d{2})\\ +([A-Z\\d /.()*]+?)\\ +(\\d{2}\\.\\d{2})\\ +([\\d ]+,\\d{2})')
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile('P\\. \\d+/\\d+') END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile('P\\. \\d+/\\d+')
SOLDE_CREDITEUR_AU_REGEX = re.compile('SOLDE CREDITEUR AU (\\d{2}\\.\\d{2}\\.\\d{4}) ([\\d ]+,\\d{2})') SOLDE_CREDITEUR_AU_REGEX = re.compile('\\ +SOLDE CREDITEUR AU (\\d{2}\\.\\d{2}\\.\\d{4})\\ +([\\d ]+,\\d{2})')
TOTAL_DES_OPERATIONS_REGEX = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([\\d ]+,\\d{2})\\ ([\\d ]+,\\d{2})') TOTAL_DES_OPERATIONS_REGEX = re.compile('\\ +TOTAL\\ DES\\ OPERATIONS\\ +([\\d ]+,\\d{2})\\ +([\\d ]+,\\d{2})')
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([\\d ]+,\\d{2})') TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile('\\ +TOTAL\\ DES\\ OPERATIONS\\ +([\\d ]+,\\d{2})')
def execute(command): def execute(command):
return subprocess.check_output(command).decode('utf-8') return subprocess.check_output(command).decode('utf-8')
def getTextFromPdf(pdfPath): def getTextFromPdf(pdfPath):
return execute(['pdftotext', '-raw', pdfPath, '-']) return execute(['pdftotext', '-layout', pdfPath, '-'])
def getDatetimeFromFileName(aDatetimeStr): def getDatetimeFromFileName(aDatetimeStr):
#aDatetime = datetime.strptime(aDatetimeStr, '%Y%m%d.pdf') #aDatetime = datetime.strptime(aDatetimeStr, '%Y%m%d.pdf')
@@ -56,6 +57,7 @@ def readPdfBankStatement(filePath):
if soldeCrediteurAuRegexMatch is not None: if soldeCrediteurAuRegexMatch is not None:
initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y') initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y')
initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2)) initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2))
print(f'{initialAmount=}')
#currentAmount = initialAmount #currentAmount = initialAmount
started = True started = True
continue continue
@@ -66,15 +68,19 @@ def readPdfBankStatement(filePath):
started = False started = False
continue continue
# We aren't interested in the content after this line # We aren't interested in the content after this line
elif line.startswith('TOTAL DES OPERATIONS'): else:
totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line) totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
totalDesOperationsCreditOnlyRegexMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line)
if totalDesOperationsRegexMatch is not None or totalDesOperationsCreditOnlyRegexMatch is not None:
# Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense. # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
# Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts. # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
if totalDesOperationsRegexMatch is not None: if totalDesOperationsRegexMatch is not None:
totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()] totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalDesOperationsRegexMatch.groups()]
else: else:
totalMonthlyCredit = toFloat(TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line).group(1)) totalMonthlyCredit = toFloat(totalDesOperationsCreditOnlyRegexMatch.group(1))
totalMonthlyDebit = 0 totalMonthlyDebit = 0
print(f'{totalMonthlyDebit=}')
print(f'{totalMonthlyCredit=}')
break break
firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.match(line) firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.match(line)
if firstLineOfPaymentRegexMatch is not None: if firstLineOfPaymentRegexMatch is not None:
@@ -88,11 +94,12 @@ def readPdfBankStatement(filePath):
'comment': '\n'.join(comment) 'comment': '\n'.join(comment)
}] }]
date = None date = None
date, valeur, amount = firstLineOfPaymentRegexMatch.groups() date, firstCommentLine, valeur, amount = firstLineOfPaymentRegexMatch.groups()
amount = toFloat(amount) amount = toFloat(amount)
#currentAmount -= amount #currentAmount -= amount
comment = [] comment = [firstCommentLine]
else: elif line != '':
print(f'comment: {line}')
comment += [line] comment += [line]
if date is not None: if date is not None:
transactions += [{ transactions += [{