from sepa import parser
import re
import csv
import pandas as pd
import numpy as np
# Utility function to remove additional namespaces from the XML
def strip_namespace(xml):
return re.sub(' xmlns="[^"] "', '', xml, count=1)
# Read file
with open('test.xml', 'r') as f:
input_data = f.read()
# Parse the bank statement XML to dictionary
camt_dict = parser.parse_string(parser.bank_to_customer_statement, bytes(strip_namespace(input_data), 'utf8'))
statements = pd.DataFrame.from_dict(camt_dict['statements'])
all_entries = []
for i, _ in statements.iterrows():
if 'entries' in camt_dict['statements'][i]:
df = pd.DataFrame()
dd = pd.DataFrame.from_records(camt_dict['statements'][i]['entries'])
dg = dd['entry_details']
df['Date'] = dd['value_date'].str['date']
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%d-%m-%Y')
iban = camt_dict['statements'][i]['account']['id']['iban']
df['IBAN'] = iban
df['Currency'] = dd['amount'].str['currency']
# Sort Credit/Debit in separate Columns
df['Credit'] = np.where(dd['credit_debit_indicator'] == 'CRDT', dd['amount'].str['_value'], '')
df['Debit'] = np.where(dd['credit_debit_indicator'] == 'DBIT', dd['amount'].str['_value'], '')
# Get destination IBAN
getlength = len(dg.index) #2
for i in range(0, getlength):
result = str(dd['entry_details'][i])
print(result "Resultat " str(i))
search_for_iban = re.search("CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
if(search_for_iban is None):
print('the search is none')
df['Test'] = 'None'
else:
print('the search is a match')
df['Test'] = 'Yes'
all_entries.append(df)
df_entries = pd.concat(all_entries)
print(df_entries)
**My problem here is just with this code block **
for i in range(0, getlength):
result = str(dd['entry_details'][i])
search_for_iban = re.search("CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
if(search_for_iban is None):
df['Test'] = 'None'
else:
df['Test'] = search_for_iban.group()
all_entries.append(df)
I have already tried to solve various things via the index, this also counts cleanly high in the variable i and the getlength is also correct for 2 entries
What im expecting If there is an IBAN number in the 'search_for_iban' (which is using regex lookup (re.search)) which is matching in 2nd row i want that iban just in 2nd row (dataframe) "Test" as follows:
What im getting I got double the entry in row 1 and 2 although none was found in row 1. What am i overlooking, my head is hurting! :D
i think i am making a thinking error here between normal for loop and panda entries
CodePudding user response:
You can try:
for i in range(0, getlength):
.
.
.
else:
df.loc[i, 'Test'] = search_for_iban