diff --git a/bnp_pdf_statement_parser.py b/bnp_pdf_statement_parser.py index d77fc6a..58259b9 100755 --- a/bnp_pdf_statement_parser.py +++ b/bnp_pdf_statement_parser.py @@ -12,12 +12,10 @@ PATH = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/bank_statements os.chdir(PATH) -PRINT_TRANSACTIONS = True +PRINT_TRANSACTIONS = False MAIN_BANK_ACCOUNT = 'compte_de_cheques' -debits = [] -credits_ = [] allTransactions = [] monthlyTransactions = {} @@ -34,13 +32,10 @@ for folder in os.listdir(): for file in os.listdir(folder): filePath = f'{folder}/{file}' print(filePath) - #print(readPdfBankStatement(filePath)) - #exit(0) transactions = readPdfBankStatement(filePath)[3] pprint(transactions) appendTransactions(transactions, folder) -#exit(1) os.chdir(f'{MAIN_BANK_ACCOUNT}/') for folder in sorted(os.listdir()): @@ -54,22 +49,20 @@ for folder in sorted(os.listdir()): #break #break -exit(1) allTransactions.sort(key = operator.itemgetter('date')) print(len(allTransactions)) -pprint(allTransactions) +#pprint(allTransactions) + +amount = X_XXX.XX +for transaction in allTransactions: + if transaction['bank account'] == MAIN_BANK_ACCOUNT: + amount += transaction['amount'] + print(transaction['date'], amount, transaction['amount']) +print(amount) sortedMonths = sorted(monthlyTransactions.keys()) #pprint(sortedMonths) -# debit or credit? -totalMonthlyCredits = [] -totalMonthlyDebits = [] -totalMonthlyDifferences = [] -#totalMonthlyDebits = [[ for transaction in monthlyTransactions[month]] for month in sortedMonths] -for month in sortedMonths: - #for transaction in monthlyTransactions[month]: - currentMonthlyTransactions = monthlyTransactions[month] - monthlyCredits = [] +totalMonthlyDifferences = [sum([transaction['amount'] for transaction in monthlyTransactions[month]]) for month in sortedMonths] fig, ax = plt.subplots() plt.title('Monthly debits and credits') diff --git a/utils.py b/utils.py index a5f2746..99b0556 100644 --- a/utils.py +++ b/utils.py @@ -2,8 +2,8 @@ import subprocess from datetime import datetime import re -# Source: [the Stack Overflow answer 766377](https://stackoverflow.com/a/766377) -FIRST_LINE_OF_PAYMENT_REGEX = re.compile('\\ +(\\d{2}\\.\\d{2})\\ +([A-Z\\d /.()*]+?)\\ +(\\d{2}\\.\\d{2})\\ +([\\d ]+,\\d{2})') +# For not-greedy `?`, see Source: [the Stack Overflow answer 766377](https://stackoverflow.com/a/766377) +FIRST_LINE_OF_PAYMENT_REGEX = re.compile('\\ +(\\d{2}\\.\\d{2})\\ +([A-Z\\d /.()*\\-,]+?)\\ +(\\d{2}\\.\\d{2})\\ +([\\d ]+,\\d{2})') END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(' +RELEVE ((DE (COMPTE (CHEQUES|D\'EPARGNE LOGEMENT|LEP))|LIVRET (A|JEUNE))|LIVRET DEV. DURABLE ET SOLIDAIRE) +P\\. \\d+/\\d+') SOLDE_CREDITEUR_AU_REGEX = re.compile('\\ +SOLDE CREDITEUR AU (\\d{2}\\.\\d{2}\\.\\d{4})\\ +([\\d ]+,\\d{2})') TOTAL_DES_OPERATIONS_REGEX = re.compile('\\ +TOTAL\\ DES\\ OPERATIONS\\ +([\\d ]+,\\d{2})\\ +([\\d ]+,\\d{2})') @@ -17,7 +17,6 @@ def getTextFromPdf(pdfPath): return execute(['pdftotext', '-layout', pdfPath, '-']) def getDatetimeFromFileName(aDatetimeStr): - #aDatetime = datetime.strptime(aDatetimeStr, '%Y%m%d.pdf') aDatetime = datetime(int(aDatetimeStr[:4]), int(aDatetimeStr[4:6]), 1) return aDatetime @@ -60,19 +59,12 @@ def readPdfBankStatement(filePath): getIndex = lambda line, type_: line.index(type_) + len(type_) debitIndex = getIndex(line, 'Débit') creditIndex = getIndex(line, 'Crédit') - #print(f'{line.index("Débit") + len("Débit")=}') - #print(f'{line.index("Crédit") + len("Crédit")=}') if soldeCrediteurAuRegexMatch is not None or (COLUMNS_HEADER.match(line) is not None and not firstPage): if soldeCrediteurAuRegexMatch is not None: initialDate = datetime.strptime(soldeCrediteurAuRegexMatch.group(1), '%d.%m.%Y') initialAmount = toFloat(soldeCrediteurAuRegexMatch.group(2)) print(f'{initialAmount=}') #currentAmount = initialAmount - ''' - else: - print(f'{line.index("Débit")=}') - print(f'{line.index("Crédit")=}') - ''' started = True continue else: @@ -83,7 +75,6 @@ def readPdfBankStatement(filePath): continue # We aren't interested in the content after this line else: - #print('hey', line) totalDesOperationsRegexMatch = TOTAL_DES_OPERATIONS_REGEX.match(line) totalDesOperationsCreditOnlyRegexMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line) if totalDesOperationsRegexMatch is not None or totalDesOperationsCreditOnlyRegexMatch is not None: @@ -99,7 +90,6 @@ def readPdfBankStatement(filePath): break firstLineOfPaymentRegexMatch = FIRST_LINE_OF_PAYMENT_REGEX.match(line) if firstLineOfPaymentRegexMatch is not None: - #print(line) if date is not None: transactions += [{ 'date': getDateFollowing(date, initialDate), @@ -108,19 +98,15 @@ def readPdfBankStatement(filePath): #'currentAmount': currentAmount, 'comment': '\n'.join(comment) }] - #print('index', lastIndex(line, ' ')) - #print(f'!{line}!') - #print(amount, len(line)) date = None date, firstCommentLine, valeur, amount = firstLineOfPaymentRegexMatch.groups() lineLen = len(line) amount = toFloat(amount) if abs(debitIndex - lineLen) < abs(creditIndex - lineLen): amount *= -1 - #currentAmount -= amount + #currentAmount += amount comment = [firstCommentLine] elif line != '': - #print(f'comment: {line}') comment += [line.strip()] if date is not None: transactions += [{ @@ -131,7 +117,3 @@ def readPdfBankStatement(filePath): 'comment': '\n'.join(comment) }] return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime - -def lastIndex(myStr, character): - index = myStr[::-1].index(character) - return len(myStr) - index - 1