I have a dataframe new_df
as shown below .
I have issue that my current code contain replace if numeric value or special character value comes after character value in column in dataframe like 70317380CH71
or 70317380CH%(
Current output Dataframe
riskID locationCode locationCity locationCountryCode
70317380CH71 CH71 PRATTELN CH
70520366HONGKONGHK EU_LL1_H_HONG_KONG_HKG HONG KONG HK
70729363MIDDLETOWNUS US_06457_MAZ_000023 MIDDLETOWN US
70317380CH&? CH&? GALE CH
Current Code
# find rows with value in country and city
m1 = new_df[['locationCity', 'locationCountryCode']].notna().all(axis=1)
# find rows with a "_"
m2 = new_df['riskID'].str.contains('-|_', na=False)
#m3= new_df['locationCode'].str.contains('_')
# both conditions above
m = m1&m2
#n =m1&m3
# replace matching rows by number city country
new_df.loc[m, 'riskID'] = (new_df.loc[m, 'riskID'].str.extract('^(\d )', expand=False)
new_df.loc[m, 'locationCity'].str.replace(' ', '') new_df.loc[m, 'locationCountryCode'])
Expected Output
riskID locationCode locationCity locationCountryCode
70317380PRATTELNCH CH71 PRATTELN CH
70520366HONGKONGHK EU_LL1_H_HONG_KONG_HKG HONG KONG HK
70729363MIDDLETOWNUS US_06457_MAZ_000023 MIDDLETOWN US
70317380GALECH CH&? GALE CH
How can this be done in pythin
CodePudding user response:
here are two options, both will have the same outcome. comment added with code.
option1: replace locationCode
with locationCity
locationCountryCode
import pandas as pd
from io import StringIO
raw_data='''
riskID,locationCode,locationCity,locationCountryCode
70317380CH71,CH71,PRATTELN,CH
70520366HONGKONGH,EU_LL1_H_HONG_KONG_HKG,HONG KONG,HK
70729363MIDDLETOWNUS,US_06457_MAZ_000023,MIDDLETOWN,US
70317380CH&?,CH&?,GALE,CH
'''
df=pd.read_csv(StringIO(raw_data))
df1=df.copy()
def change_risk_id(row): # function to handle the replacement
risk_id=row['riskID']
return risk_id.replace(row['locationCode'],row['locationCity'] row['locationCountryCode'])
df.insert(0,'riskID_new',df.apply(change_risk_id,axis=1))
df=df.drop('riskID',axis=1) # drop the old column
df = df.rename(columns={'riskID_new': 'riskID'}) # rename the column
option2: use regex to find the pattern and then replace
import pandas as pd
from io import StringIO
def change_risk_id_req_ex(row): # function to handle the replacement
re_exp='(?:\d )(\S [\d\?&])*' # regular exp to find the pattern try here- https://regex101.com/r/e37o3D/1
risk_id=row['riskID']
search_results=re.search(re_exp,risk_id)
if search_results.group(1):
return risk_id.replace(search_results.group(1),row['locationCity'] row['locationCountryCode'])
else:
return risk_id
raw_data='''
riskID,locationCode,locationCity,locationCountryCode
70317380CH71,CH71,PRATTELN,CH
70520366HONGKONGH,EU_LL1_H_HONG_KONG_HKG,HONG KONG,HK
70729363MIDDLETOWNUS,US_06457_MAZ_000023,MIDDLETOWN,US
70317380CH&?,CH&?,GALE,CH
'''
df=pd.read_csv(StringIO(raw_data))
df1=df.copy()
df.insert(0,'riskID_new',df.apply(change_risk_id_req_ex,axis=1))
df=df.drop('riskID',axis=1) # drop the old column
df = df.rename(columns={'riskID_new': 'riskID'}) # rename the column