I need to transform this dataframe
data = {'CODES': ['ABC=V; XZE=6,0; FYK=5,0',
'RTK=6,0;XZE=6,0',
'FYK=5,0; RTK=6,0;ABC=V,'],
'CUSTOMER': ['CUSTOMER1','CUSTOMER2,','CUSTOMER3',]
}
CUSTOMERCODES = pd.DataFrame(data)
into this
data2 = {'ABC': ['V',np.NaN,'V'],
'XZE': [6.0,6.0, np.NaN],
'FYK': [5.0,np.NaN, 5.0],
'RTK': [np.NaN,6.0, 6.0],
'CUSTOMER': ['CUSTOMER1','CUSTOMER2,','CUSTOMER3']
}
CUSTOMERCODES2 = pd.DataFrame(data2)
How can i split the codes column and put the corresponding values under the right columns? (and convert text values into numeric)
CodePudding user response:
Like @Olvin Roght, I am also note aware of any pandas
functionality that can achieve this (doesn't mean there isn't one). But I think you can just iterate over the exploded series and assign each value-pair to an empty data frame for the exact index position. See the following code that does this for your example:
data = {'CODES': ['ABC=V; XZE=6,0; FYK=5,0',
'RTK=6,0;XZE=6,0',
'FYK=5,0; RTK=6,0;ABC=V,'],
'CUSTOMER': ['CUSTOMER1','CUSTOMER2,','CUSTOMER3',]
}
CUSTOMERCODES = pd.DataFrame(data)
ser = CUSTOMERCODES['CODES'].str.split(";").explode()
CUSTOMERCODES2 = pd.DataFrame()
for i, v in zip(ser.index, ser.to_list()):
CUSTOMERCODES2.loc[i, v.split("=")[0].strip()] = v.split("=")[1].strip().replace(",", "")
pd.concat([CUSTOMERCODES2, CUSTOMERCODES.drop(columns=['CODES'])], axis=1)
-------------------------------
ABC XZE FYK RTK CUSTOMER
0 V 60 50 NaN CUSTOMER1
1 NaN 60 NaN 60 CUSTOMER2,
2 V NaN 50 60 CUSTOMER3
-------------------------------
CodePudding user response:
Code:
col = ['CUSTOMER'] #creating list to assign the column names
val = {} #
for i,j in enumerate(CUSTOMERCODES.CODES):
val[CUSTOMERCODES.CUSTOMER[i]]= {}
for d in j.split(';'):
col.append(d.split('=')[0].replace(' ',''))
val[CUSTOMERCODES.CUSTOMER[i]][d.split('=')[0].replace(' ','')] = d.split('=')[1].replace(' ','')
col = list(set(col)) #to get the unique values
data = pd.DataFrame(columns=col) #create empty df with column names
for Cus, level2_dict in val.items():
data.loc[len(data.index), 'CUSTOMER'] = Cus.replace(',','')
for j in level2_dict:
data.loc[data.loc[data['CUSTOMER'] == Cus.replace(',','')].index, j] = level2_dict.get(j)
data
Output:
CUSTOMER FYK XZE ABC RTK
0 CUSTOMER1 5,0 6,0 V NaN
1 CUSTOMER2 NaN 6,0 NaN 6,0
2 CUSTOMER3 5,0 NaN V, 6,0