2024-10-03 18:07:03 +02:00
import subprocess
from datetime import datetime
import re
2024-10-04 02:04:55 +02:00
# First line of a payment entry in the `pdftotext -layout` output:
#   <date dd.mm>  <label>  <value date dd.mm>  <amount "1 234,56">
# Groups: (date, first label line, value date, amount).
# The label group is non-greedy (`+?`) so that it stops before the run of spaces
# preceding the value date, see https://stackoverflow.com/a/766377
FIRST_LINE_OF_PAYMENT_REGEX = re.compile(r'\ +(\d{2}\.\d{2})\ +([A-Z\d /.()*\-,]+?)\ +(\d{2}\.\d{2})\ +([\d ]+,\d{2})')
2024-10-04 01:15:38 +02:00
# Page banner of the form "RELEVE <account kind>   P.<page>/<pages>", preceded by
# at least one space. The accepted account kinds are the ones spelled out below.
END_PAGE_AFTER_THE_FIRST_ONE_REGEX = re.compile(r" +RELEVE ((DE (COMPTE (CHEQUES|D'EPARGNE LOGEMENT|LEP))|LIVRET (A|JEUNE))|LIVRET DEV. DURABLE ET SOLIDAIRE) +P\.\d+/\d+")
2024-10-04 00:53:33 +02:00
# Opening balance line. Groups: (date dd.mm.yyyy, amount such as "1 234,56").
SOLDE_CREDITEUR_AU_REGEX = re.compile(r'\ +SOLDE CREDITEUR AU (\d{2}\.\d{2}\.\d{4})\ +([\d ]+,\d{2})')
# Totals line carrying two amounts. Groups: (first amount, second amount), read
# as (debit, credit) by the parser.
TOTAL_DES_OPERATIONS_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})\ +([\d ]+,\d{2})')
# Totals line carrying a single amount. Group: (amount,).
# NOTE(review): this pattern cannot tell a credit-only line from a debit-only one;
# it also matches the beginning of a two-amount totals line.
TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX = re.compile(r'\ +TOTAL\ DES\ OPERATIONS\ +([\d ]+,\d{2})')
2024-10-04 01:15:38 +02:00
# Header row of the operations table; each column title is preceded by one or
# more spaces.
COLUMNS_HEADER = re.compile(r' +Date +Nature des opérations +Valeur +Débit +Crédit')
2024-10-03 18:07:03 +02:00
def execute(command):
    """Run `command` and return its standard output decoded as UTF-8.

    `command` is handed unchanged to `subprocess.check_output`, which raises
    `subprocess.CalledProcessError` when the process exits with a non-zero
    status.
    """
    return subprocess.check_output(command).decode('utf-8')
def getTextFromPdf(pdfPath):
    """Return the text of the PDF at `pdfPath` as produced by `pdftotext -layout`.

    The trailing `-` argument asks `pdftotext` to write to standard output,
    which `execute` captures and decodes.
    """
    return execute(['pdftotext', '-layout', pdfPath, '-'])
2024-10-03 18:07:03 +02:00
def getDatetimeFromFileName(aDatetimeStr):
    """Return the first day of the month encoded at the start of `aDatetimeStr`.

    Characters 0-3 are read as the year and characters 4-5 as the month;
    anything after that (day, extension, ...) is ignored.
    """
    year = int(aDatetimeStr[:4])
    month = int(aDatetimeStr[4:6])
    return datetime(year, month, 1)
2024-10-03 18:07:03 +02:00
def getMonthIndexSinceEpoch(aDatetime):
    """Return `year * 12 + month` for `aDatetime`.

    Consecutive months get consecutive integers, which makes month arithmetic
    a plain integer subtraction.
    """
    return aDatetime.year * 12 + aDatetime.month
def getMonthNameFromMonthIndex(monthIndex):
    """Format a month index (`year * 12 + month`) as e.g. 'Jan 2024'.

    The index is shifted by one before dividing so that December
    (`year * 12 + 12`) stays in `year` instead of rolling over to `year + 1`.
    The month abbreviation comes from `strftime('%b')`.
    """
    year, zeroBasedMonth = divmod(monthIndex - 1, 12)
    return datetime(year, zeroBasedMonth + 1, 1).strftime('%b %Y')
2024-10-03 20:36:39 +02:00
def toFloat(group):
    """Convert an amount written like '1 234,56' to the float 1234.56.

    The decimal comma becomes a dot and the spaces used as thousands
    separators are dropped before calling `float`.
    """
    return float(group.replace(',', '.').replace(' ', ''))
2024-10-03 21:20:07 +02:00
def getDateFollowing(date, initialDate):
    """Return the first 'dd.mm' date that is not before `initialDate`.

    `date` carries no year, so it is first placed in `initialDate`'s year; if
    that lands before `initialDate` (statement spanning a new year) it is moved
    to the following year.

    The year is made part of the parsed string rather than patched in
    afterwards: parsing a bare 'dd.mm' uses the default year 1900, which is not
    a leap year, so '29.02' could never be parsed.

    Raises `ValueError` if `date` is malformed or does not exist in either year.
    """
    year = initialDate.year
    try:
        parsed = datetime.strptime(f'{date}.{year}', '%d.%m.%Y')
    except ValueError:
        # Malformed input, or a day (29.02) that does not exist in `year`.
        # The attempt below either succeeds for the next year or raises.
        parsed = None
    # To support new year.
    if parsed is None or parsed < initialDate:
        parsed = datetime.strptime(f'{date}.{year + 1}', '%d.%m.%Y')
    return parsed
2024-10-03 19:51:54 +02:00
def _endOfColumn(headerLine, title):
    """Return the index just past `title` in `headerLine`."""
    return headerLine.index(title) + len(title)


def _makeTransaction(date, valeur, amount, currentAmount, comment, initialDate):
    """Build the dictionary stored in `transactions` for one parsed payment."""
    return {
        'date': getDateFollowing(date, initialDate),
        'valeur': getDateFollowing(valeur, initialDate),
        'amount': amount,
        'current amount': currentAmount,
        'comment': '\n'.join(comment),
    }


def readPdfBankStatement(filePath):
    """Parse the bank statement PDF at `filePath`.

    The text produced by `getTextFromPdf` is scanned line by line. Parsing is
    switched on by the 'SOLDE CREDITEUR AU' line (or, after the first page, by
    the columns header), switched off at each page footer/banner, and stops for
    good at the 'TOTAL DES OPERATIONS' line.

    Returns the tuple
    `(initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime)`:

    - `initialAmount`: amount of the 'SOLDE CREDITEUR AU' line, or `None` if absent;
    - `totalMonthlyDebit`, `totalMonthlyCredit`: amounts of the
      'TOTAL DES OPERATIONS' line, or `None` if the statement has no such line;
    - `transactions`: list of dictionaries with keys 'date', 'valeur', 'amount'
      (negative for debits), 'current amount' (running balance after the
      transaction) and 'comment' (label lines joined with newlines);
    - `fileDatetime`: first day of the month encoded in the file name.

    The initial amount and the totals are also printed.
    """
    fileName = filePath.split('/')[-1]
    fileDatetime = getDatetimeFromFileName(fileName)
    lines = getTextFromPdf(filePath).splitlines()

    started = False
    firstPage = True
    initialAmount = None
    initialDate = None
    currentAmount = None
    # Default to None so that a statement without a totals line does not make
    # the final `return` fail on unbound names.
    totalMonthlyDebit = None
    totalMonthlyCredit = None
    # Fields of the payment currently being read; `date is None` means "none yet".
    date = None
    valeur = None
    amount = None
    comment = []
    transactions = []
    # Index just past the 'Débit' / 'Crédit' column titles, used to decide the
    # sign of each amount.
    debitIndex = None
    creditIndex = None

    for line in lines:
        if not started:
            # We are interested in the content after this line.
            soldeMatch = SOLDE_CREDITEUR_AU_REGEX.match(line)
            isColumnsHeader = COLUMNS_HEADER.match(line) is not None
            if isColumnsHeader:
                debitIndex = _endOfColumn(line, 'Débit')
                creditIndex = _endOfColumn(line, 'Crédit')
            if soldeMatch is not None:
                initialDate = datetime.strptime(soldeMatch.group(1), '%d.%m.%Y')
                initialAmount = toFloat(soldeMatch.group(2))
                print(f'{initialAmount = }')
                currentAmount = initialAmount
                started = True
            elif isColumnsHeader and not firstPage:
                # On following pages the payments resume right after the header.
                started = True
            continue

        # We aren't interested in the content after this line (end of page).
        if line.startswith('BNP PARIBAS SA au capital de') or END_PAGE_AFTER_THE_FIRST_ONE_REGEX.match(line) is not None:
            firstPage = False
            started = False
            continue

        # We aren't interested in anything after the totals line.
        totalMatch = TOTAL_DES_OPERATIONS_REGEX.match(line)
        creditOnlyMatch = TOTAL_DES_OPERATIONS_CREDIT_ONLY_REGEX.match(line)
        if totalMatch is not None or creditOnlyMatch is not None:
            # Note that a transfer between accounts is counted in both debits and
            # credits, as trying to cancel it out would make benefits show up as a
            # negative debit, which does not make sense.
            # January cannot just be considered as benefits only, as `20240122.pdf`
            # also contains an additional transfer between my accounts.
            if totalMatch is not None:
                totalMonthlyDebit, totalMonthlyCredit = [toFloat(group) for group in totalMatch.groups()]
            else:
                totalMonthlyCredit = toFloat(creditOnlyMatch.group(1))
                totalMonthlyDebit = 0
            print(f'{totalMonthlyDebit = }')
            print(f'{totalMonthlyCredit = }')
            break

        paymentMatch = FIRST_LINE_OF_PAYMENT_REGEX.match(line)
        if paymentMatch is not None:
            # A new payment starts: store the one read so far. `currentAmount`
            # still holds the balance right after that previous payment.
            if date is not None:
                transactions.append(_makeTransaction(date, valeur, amount, currentAmount, comment, initialDate))
            date, firstCommentLine, valeur, amount = paymentMatch.groups()
            amount = toFloat(amount)
            # The amount ends the line, so the column whose title ends closest to
            # the end of the line tells whether this is a debit or a credit.
            lineLen = len(line)
            if abs(debitIndex - lineLen) < abs(creditIndex - lineLen):
                amount = -amount
            currentAmount += amount
            comment = [firstCommentLine]
        elif line != '':
            # Continuation line of the current payment's label.
            comment.append(line.strip())

    # Store the last payment, which no following payment line has flushed.
    if date is not None:
        transactions.append(_makeTransaction(date, valeur, amount, currentAmount, comment, initialDate))
    return initialAmount, totalMonthlyDebit, totalMonthlyCredit, transactions, fileDatetime