Add bnp_pdf_statement_parser.py
				
					
				
			This commit is contained in:
		
							
								
								
									
										66
									
								
								bnp_pdf_statement_parser.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										66
									
								
								bnp_pdf_statement_parser.py
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,66 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/python3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Depends on `pdftotext`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import os, subprocess, shlex, re, config
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Directory holding the statements. This must be an f-string: without the
					# `f` prefix the `{config.RLV_CHQ}` placeholder was kept literally in the
					# path instead of being replaced by the value from the `config` module.
					path = f'/home/benjamin/Desktop/bens_folder/bazaar/documents/bnp/RLV_CHQ_{config.RLV_CHQ}'

					# Every relative path used later in the script is resolved from this directory.
					os.chdir(path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					'''
 | 
				
			||||||
 | 
					Assuming file hierarchy like:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					2022
 | 
				
			||||||
 | 
					├── 20221121.pdf
 | 
				
			||||||
 | 
					└── 20221221.pdf
 | 
				
			||||||
 | 
					2023
 | 
				
			||||||
 | 
					├── 20230123.pdf
 | 
				
			||||||
 | 
					└── 20230221.pdf
 | 
				
			||||||
 | 
					'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def execute(command):
					    '''
					    Run `command` through the shell and return its standard output as text.

					    `command` is a single shell command line (it is passed with
					    `shell = True`), so callers are responsible for quoting any argument
					    they embed in it.

					    Raises `subprocess.CalledProcessError` if the command exits with a
					    non-zero status. The output is decoded as UTF-8.
					    '''
					    return subprocess.check_output(command, shell = True).decode('utf-8')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def getTextFromPdf(pdfPath):
					    '''
					    Return the text of the PDF at `pdfPath` as produced by `pdftotext -raw`.

					    The path is shell-quoted before being embedded in the command line, and
					    the trailing `-` makes `pdftotext` write to standard output instead of
					    a file.
					    '''
					    pdfPath = shlex.quote(pdfPath)
					    return execute(f'pdftotext -raw {pdfPath} -')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Raw strings are used so that `\d` and `\.` reach the regex engine as-is
					# instead of being treated as (invalid) string escape sequences.

					# Matches a line beginning with two `NN.NN` tokens followed by a number with
					# a two-digit comma decimal part, e.g. `21.03 21.03 12,50`.
					firstLineOfPaymentRegex = re.compile(r'\d{2}\.\d{2} \d{2}\.\d{2} \d+,\d{2}')
					# Matches a page marker such as `P. 2/3`.
					endPageAfterTheFirstOneRegex = re.compile(r'P\. \d+/\d+')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Walk the `<folder>/<file>` hierarchy under the current directory, extract
					# the text of a statement and print the lines located between the start and
					# end markers, with an empty line before each line that looks like the first
					# line of a payment.
					for folder in os.listdir():
					    for file in os.listdir(folder):
					        # NOTE(review): hard-coded override of the loop variables; together
					        # with the two `break`s below, only this one statement is ever
					        # processed. Looks like debugging scaffolding - confirm before
					        # removing.
					        folder = '2022'
					        file = '20220321.pdf'
					        print(folder, file)
					        filePath = f'{folder}/{file}'
					        content = getTextFromPdf(filePath)
					        lines = content.splitlines()
					        # `started` is True while we are inside a section whose lines must
					        # be printed; `firstPage` is True until the first end marker is seen.
					        started = False
					        firstPage = True
					        for line in lines:
					            if not started:
					                # We are interested in the content after this line:
					                if line.startswith('SOLDE CREDITEUR AU') or (line.startswith('Date Nature des opérations Valeur Débit Crédit') and not firstPage):
					                    started = True
					                    continue
					            else:
					                # We aren't interested in the content after this line:
					                if line.startswith('BNP PARIBAS SA au capital de') or endPageAfterTheFirstOneRegex.match(line) is not None:
					                    firstPage = False
					                    started = False
					                    continue
					                # We aren't interested in anything after this line, stop
					                # reading the statement.
					                elif line.startswith('TOTAL DES OPERATIONS'):
					                    break
					                # Visually separate payments with an empty line.
					                if firstLineOfPaymentRegex.match(line) is not None:
					                    print()
					                print(line)
					        # NOTE(review): stops after the first file - see the override above.
					        break
					    # NOTE(review): stops after the first folder - see the override above.
					    break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# TODO: check year
 | 
				
			||||||
 | 
					# TODO: debit/credit
 | 
				
			||||||
 | 
					# TODO: amount after transaction
 | 
				
			||||||
		Reference in New Issue
	
	Block a user