2024-10-03 18:07:03 +02:00
import subprocess
from datetime import datetime
import re
# Patterns applied with `.match()` (anchored at the start of the line) to each
# line of the statement text while reading a PDF bank statement.
# NOTE(review): the whitespace inside these pattern literals looks altered by
# tooling — confirm the exact patterns against a real statement before relying
# on them.

# Detects the first line of a payment entry; a matching line is split on
# whitespace into three fields (date, value date, amount).
FIRST_LINE_OF_PAYMENT_REGEX = re . compile ( ' \\ d {2} \\ . \\ d {2} \\ d {2} \\ . \\ d {2} \\ d+, \\ d {2} ' )
# Detects a page marker; together with the bank's legal footer line it pauses
# parsing until the next operations table header.
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re . compile ( ' P \\ . \\ d+/ \\ d+ ' )
# Detects the opening credit balance line; the text left after removing the
# match is parsed as the initial amount.
SOLDE_CREDITEUR_AU_REGEX = re . compile ( ' SOLDE CREDITEUR AU \\ d {2} \\ . \\ d {2} \\ . \\ d {4} ' )
# Totals line carrying two amounts; the two capture groups are read as the
# monthly debit total and the monthly credit total, in that order.
TOTAL_DES_OPERATIONS_REGEX = re . compile ( ' TOTAL \\ DES \\ OPERATIONS \\ ([0-9 ]+, \\ d {2} ) \\ ([0-9 ]+, \\ d {2} ) ' )
2024-10-03 19:33:54 +02:00
# Fallback for a totals line carrying a single amount: its only capture group
# is read as the monthly credit total and the debit total is then set to 0.
# NOTE(review): the whitespace inside the pattern literal looks altered by
# tooling — confirm the exact pattern.
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re . compile ( ' TOTAL \\ DES \\ OPERATIONS \\ ([0-9 ]+, \\ d {2} ) ' )
2024-10-03 18:07:03 +02:00
def execute(command):
    """Run *command* and return everything it wrote to stdout as text.

    Args:
        command: The argument vector handed to ``subprocess.check_output``.

    Returns:
        The captured standard output decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
        UnicodeDecodeError: If the output is not valid UTF-8.
    """
    rawOutput = subprocess.check_output(command)
    return rawOutput.decode('utf-8')
def getTextFromPdf(pdfPath):
    """Extract the text of a PDF by running the ``pdftotext`` command.

    Args:
        pdfPath: Path of the PDF file to read.

    Returns:
        The text that ``pdftotext`` prints to standard output.

    Raises:
        FileNotFoundError: If the ``pdftotext`` executable is not available.
        subprocess.CalledProcessError: If ``pdftotext`` exits with an error.
    """
    # Every argv entry must be exact: padded values such as ' pdftotext ' or
    # ' -raw ' would not resolve to the executable or its options.
    # `-raw` is passed as the layout option; the final `-` sends the text to
    # stdout instead of a file.
    command = ['pdftotext', '-raw', pdfPath, '-']
    return execute(command)
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of a file name.

    Args:
        aDatetimeStr: A name beginning with ``YYYYMM`` digits, for example
            ``'20240122.pdf'``. Only the first six characters are read.

    Returns:
        A naive ``datetime`` at midnight on day 1 of that year and month.

    Raises:
        ValueError: If the first six characters are not a valid year and
            month.
    """
    year = int(aDatetimeStr[:4])
    month = int(aDatetimeStr[4:6])
    # The day part of the name is deliberately ignored: statements are keyed
    # by month, so the result is always pinned to the first of the month.
    return datetime(year, month, 1)
2024-10-03 18:07:03 +02:00
def getMonthIndexSinceEpoch(aDatetime):
    """Map a date to an integer that grows by exactly one per calendar month.

    The value is ``12 * year + month`` using the 1-based calendar month, so
    the count starts at year 0 rather than at the Unix epoch.
    ``getMonthNameFromMonthIndex`` performs the reverse mapping.

    Args:
        aDatetime: Any object exposing ``year`` and ``month`` attributes.

    Returns:
        The month index as an integer.
    """
    monthsInElapsedYears = 12 * aDatetime.year
    return monthsInElapsedYears + aDatetime.month
def getMonthNameFromMonthIndex(monthIndex):
    """Format a month index as an abbreviated month name followed by the year.

    Args:
        monthIndex: An index equal to ``12 * year + month`` with a 1-based
            month, as produced by ``getMonthIndexSinceEpoch``.

    Returns:
        A string such as ``'Jan 2024'`` (``strftime('%b %Y')``).

    Raises:
        ValueError: If the index maps to a year outside the range supported
            by ``datetime``.
    """
    # Shifting by one before dividing keeps December (month 12) in its own
    # year instead of rolling it over into the next one.
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    return datetime(year, zeroBasedMonth + 1, 1).strftime('%b %Y')
def _parseAmount(text):
    """Convert an amount written like ``'1 234,56'`` to a float."""
    return float(text.replace(',', '.').replace(' ', ''))


def _parseMonthlyTotals(line):
    """Return ``(debit, credit)`` read from a ``TOTAL DES OPERATIONS`` line."""
    bothTotalsMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
    if bothTotalsMatch is not None:
        debit, credit = [_parseAmount(group) for group in bothTotalsMatch.groups()]
        return debit, credit
    creditOnlyMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line)
    if creditOnlyMatch is None:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError('Unparseable totals line: %r' % (line,))
    # A single amount on the totals line is read as the credit total.
    return 0, _parseAmount(creditOnlyMatch.group(1))


def _closeTransaction(pending, commentLines):
    """Return the finished transaction record with its comment lines joined."""
    return dict(pending, comment='\n'.join(commentLines))


def readPdfBankStatement(filePath):
    """Parse one PDF bank statement into its balance, totals and transactions.

    The PDF text is obtained with ``getTextFromPdf`` and scanned line by line.
    Parsing starts at the opening credit balance line, pauses at each page
    footer, resumes at the operations table header of following pages, and
    stops at the ``TOTAL DES OPERATIONS`` line.

    Args:
        filePath: ``/``-separated path of the statement; the file name must
            begin with ``YYYYMM`` digits (see ``getDatetimeFromFileName``).

    Returns:
        A tuple ``(initialAmount, totalMonthlyDebit, totalMonthlyCredit,
        transactions, fileDatetime)``:

        * ``initialAmount``: the opening balance, or ``None`` if no opening
          balance line was found.
        * ``totalMonthlyDebit`` / ``totalMonthlyCredit``: the statement
          totals, or ``None`` for both if no totals line was found. When the
          totals line carries a single amount, the debit total is ``0``.
        * ``transactions``: a list of dicts with the keys ``date``,
          ``valeur``, ``amount``, ``currentAmount`` (running balance after
          the transaction) and ``comment`` (the following description lines
          joined with newlines).
        * ``fileDatetime``: the month derived from the file name.

    Raises:
        ValueError: If the file name does not start with a valid year and
            month, an amount cannot be converted, or the totals line cannot
            be parsed.
    """
    fileName = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(fileName)
    lines = getTextFromPdf(filePath).splitlines()

    started = False
    firstPage = True
    initialAmount = None
    currentAmount = None
    # Defined up front so that a statement without a totals line yields None
    # values instead of an UnboundLocalError at the return statement.
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    pending = None
    commentLines = []
    transactions = []

    for line in lines:
        if not started:
            # We are interested in the content after this line:
            isOpeningBalance = SOLDE_CREDITEUR_AU_REGEX.match(line) is not None
            isTableHeader = not firstPage and line.startswith('Date Nature des opérations Valeur Débit Crédit')
            if isOpeningBalance:
                initialAmount = _parseAmount(SOLDE_CREDITEUR_AU_REGEX.sub('', line))
                currentAmount = initialAmount
            if isOpeningBalance or isTableHeader:
                started = True
            continue

        # We aren't interested in the content after this line:
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None:
            firstPage = False
            started = False
            continue

        # We aren't interested in the content after this line
        if line.startswith('TOTAL DES OPERATIONS'):
            # Note that transfer between accounts will be noted in both debits and credits, as trying to cancel would make benefits show as negative debit which does not make sense.
            # Cannot just consider January as benefits only as `20240122.pdf` also contains an additional transfer between my accounts.
            totalMonthlyDebit, totalMonthlyCredit = _parseMonthlyTotals(line)
            break

        if FIRST_LINE_OF_PAYMENT_REGEX.match(line) is not None:
            # A new payment begins: the previous one, if any, is now complete.
            if pending is not None:
                transactions.append(_closeTransaction(pending, commentLines))
            date, valeur, rawAmount = line.split()
            amount = _parseAmount(rawAmount)
            # NOTE(review): every amount is subtracted from the running
            # balance, i.e. each payment line is treated as a debit — confirm
            # this is intended for credited operations.
            currentAmount -= amount
            pending = {
                'date': date,
                'valeur': valeur,
                'amount': amount,
                'currentAmount': currentAmount,
            }
            commentLines = []
        else:
            # Description line of the current payment; lines seen before the
            # first payment are dropped when that payment resets the list.
            commentLines.append(line)

    # The last payment has no following payment line to trigger its flush.
    if pending is not None:
        transactions.append(_closeTransaction(pending, commentLines))

    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime