diff --git a/README.md b/README.md new file mode 100644 index 0000000..458e674 --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# BNP PDF statement parser + +Depends on `pdftotext`. + +Assuming file hierarchy like: + +``` +. +├── compte_de_cheques/ +│ ├── 2022/ +│ │ ├── 20221121.pdf +│ │ └── 20221221.pdf +│ └── 2023/ +│ ├── 20230123.pdf +│ └── 20230221.pdf +livret_a/ +├── 20230721.pdf +└── 20240122.pdf +``` diff --git a/bnp_pdf_statement_parser.py b/bnp_pdf_statement_parser.py index 7646fd1..ea01264 100755 --- a/bnp_pdf_statement_parser.py +++ b/bnp_pdf_statement_parser.py @@ -1,40 +1,15 @@ #!/usr/bin/env python -# Depends on `pdftotext`. - import os -import subprocess -import re import matplotlib.pyplot as plt import matplotlib.ticker as ticker from datetime import datetime +from utils import getTextFromPdf, getDatetimeFromFileName, getMonthIndexSinceEpoch, getMonthNameFromMonthIndex, FIRST_LINE_OF_PAYMENT_REGEX, END_PAGE_AFTER_THE_FIRST_ONE_REGEX, SOLDE_CREDITEUR_AU_REGEX, TOTAL_DES_OPERATIONS_REGEX PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements/compte_de_cheques/' os.chdir(PATH) -''' -Assuming file hierarchy like: - -2022 -├── 20221121.pdf -└── 20221221.pdf -2023 -├── 20230123.pdf -└── 20230221.pdf -''' - -def execute(command): - return subprocess.check_output(command).decode('utf-8') - -def getTextFromPdf(pdfPath): - return execute(['pdftotext', '-raw', pdfPath, '-']) - -FIRST_LINE_OF_PAYMENT_REGEX = re.compile('\\d{2}\\.\\d{2} \\d{2}\\.\\d{2} \\d+,\\d{2}') -END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile('P\\. 
\\d+/\\d+') -SOLDE_CREDITEUR_AU_REGEX = re.compile('SOLDE CREDITEUR AU \\d{2}\\.\\d{2}\\.\\d{4}') -TOTAL_DES_OPERATIONS_REGEX = re.compile('TOTAL\\ DES\\ OPERATIONS\\ ([0-9 ]+,\\d{2})\\ ([0-9 ]+,\\d{2})') - PRINT_TRANSACTIONS = False totalMonthlyDebits = [] @@ -47,7 +22,7 @@ for folder in sorted(os.listdir()): for file in sorted(os.listdir(folder)): filePath = f'{folder}/{file}' print(filePath) - currentDatetime = getDatetime(file) + currentDatetime = getDatetimeFromFileName(file) if firstDatetime is None: firstDatetime = currentDatetime content = getTextFromPdf(filePath) @@ -100,7 +75,7 @@ for folder in sorted(os.listdir()): comment += [line] #break #break -lastDatetime = getDatetime(file) +lastDatetime = getDatetimeFromFileName(file) fig, ax = plt.subplots() plt.title('Monthly debits and credits') @@ -108,14 +83,7 @@ plt.xlabel('Date') plt.ylabel('€') ALPHA = 0.5 -def getDatetime(aDatetimeStr): - return datetime.strptime(aDatetimeStr, '%Y%m%d.pdf') - -def getMonthIndex(aDatetime): - return aDatetime.year * 12 + aDatetime.month - -xTicks = range(getMonthIndex(firstDatetime), getMonthIndex(lastDatetime) + 1) -# sign does not seem respected for `totalMonthlyDifferences`. +xTicks = range(getMonthIndexSinceEpoch(firstDatetime), getMonthIndexSinceEpoch(lastDatetime) + 1) totalMonthlyAmountAndLabel = ( #(totalMonthlyDebits, 'Debit'), #(totalMonthlyCredits, 'Credit'), @@ -129,10 +97,7 @@ plt.legend() #plt.yscale('symlog') ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,}')) -def getMonthName(monthIndex): - return datetime((monthIndex - 1) // 12, 1 + (monthIndex - 1) % 12, 1).strftime('%b %Y') - -ticksLabels = [getMonthName(monthIndex) for monthIndex in xTicks] +ticksLabels = [getMonthNameFromMonthIndex(monthIndex) for monthIndex in xTicks] plt.xticks(xTicks, ticksLabels, rotation = 90) #plt.tight_layout() # How to show the horizontal lines for subticks? 
diff --git a/utils.py b/utils.py
new file mode 100644
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,59 @@
+"""Helpers shared by the BNP PDF statement parsing scripts.
+
+Depends on the `pdftotext` command-line tool, which is looked up on the PATH.
+"""
+
+import os
+import re
+import subprocess
+from datetime import datetime
+
+# Regular expressions imported by the statement parser script.
+# Raw strings keep the escapes readable; the pattern text is unchanged.
+FIRST_LINE_OF_PAYMENT_REGEX = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
+END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(r'P\. \d+/\d+')
+SOLDE_CREDITEUR_AU_REGEX = re.compile(r'SOLDE CREDITEUR AU \d{2}\.\d{2}\.\d{4}')
+TOTAL_DES_OPERATIONS_REGEX = re.compile(r'TOTAL\ DES\ OPERATIONS\ ([0-9 ]+,\d{2})\ ([0-9 ]+,\d{2})')
+
+MONTHS_PER_YEAR = 12
+
+
+def execute(command):
+    """Run `command` (an argument list) and return its standard output decoded as UTF-8.
+
+    Raises `subprocess.CalledProcessError` if the command exits with a non-zero status.
+    """
+    return subprocess.check_output(command).decode('utf-8')
+
+
+def getTextFromPdf(pdfPath):
+    """Return the text of the PDF at `pdfPath`, extracted with `pdftotext -raw`."""
+    # The trailing '-' makes `pdftotext` write the text to standard output.
+    return execute(['pdftotext', '-raw', pdfPath, '-'])
+
+
+def getDatetimeFromFileName(aDatetimeStr):
+    """Parse a statement file name such as '20221121.pdf' into a `datetime`.
+
+    Leading directories are ignored, so '2022/20221121.pdf' is accepted too.
+    Raises `ValueError` if the base name does not match '%Y%m%d.pdf'.
+    """
+    return datetime.strptime(os.path.basename(aDatetimeStr), '%Y%m%d.pdf')
+
+
+def getMonthIndexSinceEpoch(aDatetime):
+    """Return `year * 12 + month` for `aDatetime`.
+
+    Consecutive calendar months map to consecutive integers.
+    """
+    return aDatetime.year * MONTHS_PER_YEAR + aDatetime.month
+
+
+def getMonthNameFromMonthIndex(monthIndex):
+    """Format a month index from `getMonthIndexSinceEpoch` as '%b %Y' (e.g. 'Nov 2022' in an English locale).
+
+    The month abbreviation comes from `strftime` and follows the current locale.
+    """
+    # Subtract 1 before dividing so that December (month 12) stays in its own year.
+    year, monthOffset = divmod(monthIndex - 1, MONTHS_PER_YEAR)
+    return datetime(year, monthOffset + 1, 1).strftime('%b %Y')