# 2024-10-03 18:07:03 +02:00
import subprocess
from datetime import datetime
import re
# 2024-10-03 20:36:39 +02:00
# Patterns applied with ``.match()`` (anchored at the start of the line) to the
# text lines of a statement. Amounts use a decimal comma and may contain spaces
# between digit groups, hence the ``[\d ]+,\d{2}`` sub-pattern.

# First line of a payment: two ``DD.MM`` dates followed by an amount.
# ``\s*`` tolerates any amount of whitespace between the three fields.
FIRST_LINE_OF_PAYMENT_REGEX = re.compile(
    r'(\d{2}\.\d{2})\s*(\d{2}\.\d{2})\s*([\d ]+,\d{2})'
)

# Page marker of the form ``P.<page>/<total>``.
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(r'P\.\s*\d+/\d+')

# Opening balance line: a ``DD.MM.YYYY`` date followed by an amount.
SOLDE_CREDITEUR_AU_REGEX = re.compile(
    r'SOLDE CREDITEUR AU\s*(\d{2}\.\d{2}\.\d{4})\s*([\d ]+,\d{2})'
)

# Totals line carrying two amounts.
TOTAL_DES_OPERATIONS_REGEX = re.compile(
    r'TOTAL DES OPERATIONS ([\d ]+,\d{2}) ([\d ]+,\d{2})'
)

# Totals line carrying a single amount.
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile(
    r'TOTAL DES OPERATIONS ([\d ]+,\d{2})'
)
# 2024-10-03 18:07:03 +02:00
def execute(command):
    """Run *command* and return its standard output decoded as UTF-8.

    Args:
        command: The argument list handed to ``subprocess.check_output``.

    Returns:
        Everything the process wrote to standard output, as ``str``.

    Raises:
        subprocess.CalledProcessError: If the process exits with a non-zero
            status.
        UnicodeDecodeError: If the output is not valid UTF-8.
    """
    return subprocess.check_output(command).decode('utf-8')
def getTextFromPdf(pdfPath):
    """Return the text of the PDF at *pdfPath* as produced by ``pdftotext``.

    The external ``pdftotext`` program is invoked with the ``-raw`` option and
    ``-`` as output file, and whatever it writes to standard output is
    returned as a string.

    Args:
        pdfPath: Path of the PDF file to convert.

    Returns:
        The captured output of ``pdftotext`` as ``str``.
    """
    return execute(['pdftotext', '-raw', pdfPath, '-'])
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of a file name.

    The first four characters are read as the year and the following two as
    the month (for instance ``'20240122.pdf'``). Everything after those six
    characters, including the day digits, is ignored: the day of the returned
    ``datetime`` is always 1.

    Raises:
        ValueError: If the leading characters are not digits or do not form a
            valid year/month.
    """
    yearDigits, monthDigits = aDatetimeStr[:4], aDatetimeStr[4:6]
    return datetime(year=int(yearDigits), month=int(monthDigits), day=1)
# 2024-10-03 18:07:03 +02:00
def getMonthIndexSinceEpoch(aDatetime):
    """Collapse a date into one integer month counter: ``year * 12 + month``.

    Consecutive calendar months map to consecutive integers, so the difference
    between two results is a number of months.
    """
    monthsPerYear = 12
    wholeYears = aDatetime.year * monthsPerYear
    return wholeYears + aDatetime.month
def getMonthNameFromMonthIndex(monthIndex):
    """Format a ``year * 12 + month`` counter as abbreviated month and year.

    Args:
        monthIndex: Integer month counter where the month part runs from 1
            to 12 (so ``2024 * 12 + 1`` is January 2024).

    Returns:
        The ``'%b %Y'`` rendering of the first day of that month, for example
        ``'Jan 2024'``. The month abbreviation follows the current locale.
    """
    # Shift to a zero-based month before dividing so that December
    # (month == 12) stays in its own year instead of rolling into the next.
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    return datetime(year, zeroBasedMonth + 1, 1).strftime('%b %Y')
# 2024-10-03 20:36:39 +02:00
def toFloat(group):
    """Convert an amount written with a decimal comma into a ``float``.

    Whitespace used to group digits is removed (regular spaces as well as
    no-break and narrow no-break spaces) and the decimal comma is replaced by
    a dot before conversion.

    Args:
        group: Amount text such as ``'1 234,56'``.

    Returns:
        The amount as a ``float`` (``1234.56`` for the example above).

    Raises:
        ValueError: If the cleaned-up text is not a valid number.
    """
    # str.split() with no argument splits on every Unicode whitespace
    # character, so joining the pieces strips all digit-group separators.
    withoutSeparators = ''.join(group.split())
    return float(withoutSeparators.replace(',', '.'))
# 2024-10-03 21:20:07 +02:00
def getDateFollowing(date, initialDate):
    """Resolve a year-less ``DD.MM`` date to the first match on/after a date.

    The day and month are placed in the year of *initialDate*; when that falls
    before *initialDate* (a statement spanning New Year), the following year
    is used instead.

    The date is built directly in the candidate year rather than parsed with
    ``strptime('%d.%m')``: without a year ``strptime`` assumes 1900, which is
    not a leap year, so ``'29.02'`` could never be parsed.

    Args:
        date: Day and month as ``'DD.MM'``.
        initialDate: ``datetime`` the result must not precede.

    Returns:
        A ``datetime`` at midnight, in the year of *initialDate* or the next.

    Raises:
        ValueError: If *date* is not of the form ``DD.MM`` or does not name a
            real calendar day in either candidate year.
    """
    day, month = (int(part) for part in date.split('.'))
    for year in (initialDate.year, initialDate.year + 1):
        try:
            candidate = datetime(year, month, day)
        except ValueError:
            # Day does not exist in this year (29 February outside a leap
            # year); the next candidate year may still be valid.
            continue
        if candidate >= initialDate:
            return candidate
    raise ValueError(
        'Cannot place date {!r} on or after {}'.format(date, initialDate)
    )
# 2024-10-03 19:51:54 +02:00
def _buildTransaction(date, valeur, amount, commentLines, initialDate):
    """Assemble one transaction record from the fields of a payment line."""
    return {
        'date': getDateFollowing(date, initialDate),
        'valeur': getDateFollowing(valeur, initialDate),
        'amount': amount,
        'comment': '\n'.join(commentLines),
    }


def readPdfBankStatement(filePath):
    """Extract the balance, totals and transactions from a PDF statement.

    The PDF is converted to text with ``getTextFromPdf`` and scanned line by
    line. Lines are only interpreted inside the operations table: scanning
    starts at the ``SOLDE CREDITEUR AU`` line (or, on pages after the first,
    at the table header), pauses at each page footer, and stops for good at
    the ``TOTAL DES OPERATIONS`` line. Inside the table, a line matching
    ``FIRST_LINE_OF_PAYMENT_REGEX`` opens a new transaction and every other
    line is collected as comment text of the transaction currently open.

    Args:
        filePath: ``/``-separated path to the PDF. The last path component is
            passed to ``getDatetimeFromFileName``.

    Returns:
        A tuple ``(initialAmount, totalMonthlyDebit, totalMonthlyCredit,
        transactions, fileDatetime)`` where

        * ``initialAmount`` is the amount on the ``SOLDE CREDITEUR AU`` line,
          or ``None`` if that line was not found;
        * ``totalMonthlyDebit`` / ``totalMonthlyCredit`` come from the
          ``TOTAL DES OPERATIONS`` line (debit is 0 when the line carries a
          single amount), or are both ``None`` if no such line was found;
        * ``transactions`` is a list of dicts with keys ``'date'``,
          ``'valeur'``, ``'amount'`` and ``'comment'``; the amount is stored
          as parsed from the payment line, no sign is applied here;
        * ``fileDatetime`` is the ``datetime`` derived from the file name.

    Raises:
        ValueError: If a ``TOTAL DES OPERATIONS`` line matches neither totals
            pattern.
    """
    fileName = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(fileName)
    lines = getTextFromPdf(filePath).splitlines()

    started = False
    firstPage = True
    initialAmount = None
    initialDate = None
    # Bound up front so that a statement without a totals line yields None
    # values instead of failing with UnboundLocalError at the return.
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    # (date, valeur, amount) of the transaction still collecting comment lines.
    pending = None
    comment = []
    transactions = []

    for line in lines:
        if not started:
            # We are interested in the content after this line.
            soldeMatch = SOLDE_CREDITEUR_AU_REGEX.match(line)
            if soldeMatch is not None:
                initialDate = datetime.strptime(soldeMatch.group(1), '%d.%m.%Y')
                initialAmount = toFloat(soldeMatch.group(2))
                started = True
            elif not firstPage and line.startswith(
                'Date Nature des opérations Valeur Débit Crédit'
            ):
                started = True
            continue

        # We aren't interested in the content after this line (page footer):
        # pause until the table resumes on the next page.
        if (
            line.startswith('BNP PARIBAS SA au capital de')
            or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None
        ):
            firstPage = False
            started = False
            continue

        # The totals line closes the operations table.
        if line.startswith('TOTAL DES OPERATIONS'):
            # Note that transfers between accounts are counted in both debits
            # and credits, as trying to cancel them out would make benefits
            # show up as a negative debit, which does not make sense.
            # Cannot just consider January as benefits only, as `20240122.pdf`
            # also contains an additional transfer between accounts.
            bothTotalsMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
            if bothTotalsMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = (
                    toFloat(group) for group in bothTotalsMatch.groups()
                )
            else:
                creditOnlyMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line)
                if creditOnlyMatch is None:
                    raise ValueError(
                        'Unrecognised totals line: {!r}'.format(line)
                    )
                totalMonthlyCredit = toFloat(creditOnlyMatch.group(1))
                totalMonthlyDebit = 0
            break

        paymentMatch = FIRST_LINE_OF_PAYMENT_REGEX.match(line)
        if paymentMatch is None:
            comment.append(line)
            continue

        # A new payment line closes the transaction that was being collected.
        if pending is not None:
            transactions.append(_buildTransaction(*pending, comment, initialDate))
        date, valeur, amount = paymentMatch.groups()
        pending = (date, valeur, toFloat(amount))
        comment = []

    # Flush the last transaction, which no later payment line has closed.
    if pending is not None:
        transactions.append(_buildTransaction(*pending, comment, initialDate))

    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime
def printTransaction(transaction):
    """Print one transaction to standard output.

    Writes the ``'date'``, ``'valeur'`` and ``'amount'`` values on one line
    separated by spaces, then the ``'comment'`` value, then an empty line.

    Args:
        transaction: Mapping with the keys ``'date'``, ``'valeur'``,
            ``'amount'`` and ``'comment'``.

    Raises:
        KeyError: If one of those keys is missing.
    """
    print(transaction['date'], transaction['valeur'], transaction['amount'])
    print(transaction['comment'])
    print()