I have a df with multiple columns like this (there are many more cols & rows):
df = pd.DataFrame([
{'ID': 1,'date': '2022-01-01', 'fruit_code':'[100,99,300]', 'vegetable_code':'[1000,2000,3000]','supermarket':'xy',},
{'ID': 2,'date': '2022-01-01', 'fruit_code':'[67,200,87]', 'vegetable_code':'[5000]','supermarket':'z, m'},
{'ID': 3,'date': '2021-01-01', 'fruit_code':'[100,5,300,78]', 'vegetable_code':'[7000,2000,3000]','supermarket':'wf, z'},
{'ID': 4,'date': '2020-01-01', 'fruit_code':'[77]', 'vegetable_code':'[1000]','supermarkt':'wf'},
{'ID': 5,'date': '2022-15-01', 'fruit_code':'[100,200,546,33]', 'vegetable_code':'[4000,2000,3000]','supermarket':'t, wf'},
{'ID': 6,'date': '2002-12-01', 'fruit_code':'[64,2]', 'vegetable_code':'[6000,8000,1000]','supermarket':'k' },
{'ID': 7,'date': '2018-12-01', 'fruit_code':'[5]', 'vegetable_code':'[6000,8000,1000]','supermarket':'p' }
])
my expected df should look like this in the end:
df = pd.DataFrame([
{'ID': 1,'date': '2022-01-01', 'fruit_code':'[100,99,300]', 'vegetable_code':'[1000,2000,3000]','supermarket':'xy','new_col_1':'all'},
{'ID': 2,'date': '2022-01-01', 'fruit_code':'[67,200,87]', 'vegetable_code':'[5000]','supermarket':'z, m','new_col_1':'[5000]'},
{'ID': 3,'date': '2021-01-01', 'fruit_code':'[100,5,300,78]', 'vegetable_code':'[7000,2000,3000]','supermarket':'wf, z','new_col_1':'all'},
{'ID': 4,'date': '2020-01-01', 'fruit_code':'[77]', 'vegetable_code':'[1000]','supermarket':'wf','new_col_1':'[77]'},
{'ID': 5,'date': '2022-15-01', 'fruit_code':'[100,200,546,33]', 'vegetable_code':'[4000,2000,3000]','supermarket':'t, wf','new_col_1':'all'},
{'ID': 6,'date': '2002-12-01', 'fruit_code':'[64,2]', 'vegetable_code':'[6000,8000,1000]','supermarket':'k', 'new_col_1':'[64]', 'new_col_2':'[2]'},
{'ID': 7,'date': '2018-12-01', 'fruit_code':'[5]', 'vegetable_code':'[6000,8000,1000]','supermarket':'p' ,'new_col_1':'all'}
])
and here are multiple conditions I want to apply on cols fruit_code & vegetable_code to get two new columns:
UPDATE
def fruits_vegetable(row):
if len(str(row['fruit_code'])) == 1: # fruit_code in new_col_1
row['new_col_1'] = row['fruit_code']
elif len(str(row['fruit_code'])) == 1 and len(str(row['vegetable_code'])) > 1: # write "all" in new_col_1
row['new_col_1'] = 'all'
elif len(str(row['fruit_code'])) > 2 and len(str(row['vegetable_code'])) == 1: # vegetable_code in new_col_1
row['new_col_1'] = row['vegetable_code']
elif len(str(row['fruit_code'])) > 3 and len(str(row['vegetable_code'])) > 1: # write "all" in new_col_1
row['new_col_1'] = 'all'
elif len(str(row['fruit_code'])) == 2 and len(str(row['vegetable_code'])) >= 0: # fruit 1 new_col_1 & fruit 2 new_col_2
row['new_col_1'] = row['fruit_code'][0]
row['new_col_2'] = row['fruit_code'][1]
return row
df = df.apply(fruits_vegetable, axis=1)
I'm still stuck, now I get "all" in some of the rows for the first column, but the second does not change.
If someone has some insights, that would be great.
Thanks, much appreciated
CodePudding user response:
First is necessary convert strings repr of lists by ast.literal_eval
to lists, then for chceck length remove casting to strings. If need one element lists instead scalars use []
in fruit[0]
and fruit[1]
and last change order of condition for len(fruit) == 1
, also change len(fruit) > 3
to len(fruit) > 2
for match first row:
def fruits_vegetable(row):
fruit = ast.literal_eval(row['fruit_code'])
vege = ast.literal_eval(row['vegetable_code'])
if len(fruit) == 1 and len(vege) > 1: # write "all" in new_col_1
row['new_col_1'] = 'all'
elif len(fruit) > 2 and len(vege) == 1: # vegetable_code in new_col_1
row['new_col_1'] = vege
elif len(fruit) > 2 and len(vege) > 1: # write "all" in new_col_1
row['new_col_1'] = 'all'
elif len(fruit) == 2 and len(vege) >= 0:# fruit 1 new_col_1 & fruit 2 new_col_2
row['new_col_1'] = [fruit[0]]
row['new_col_2'] = [fruit[1]]
elif len(fruit) == 1: # fruit_code in new_col_1
row['new_col_1'] = fruit
return row
df = df.apply(fruits_vegetable, axis=1)
print (df)
ID date fruit_code new_col_1 new_col_2 supermarket \
0 1 2022-01-01 [100,99,300] all NaN xy
1 2 2022-01-01 [67,200,87] [5000] NaN z, m
2 3 2021-01-01 [100,5,300,78] all NaN wf, z
3 4 2020-01-01 [77] [77] NaN NaN
4 5 2022-15-01 [100,200,546,33] all NaN t, wf
5 6 2002-12-01 [64,2] [64] [2] k
6 7 2018-12-01 [5] all NaN p
supermarkt vegetable_code
0 NaN [1000,2000,3000]
1 NaN [5000]
2 NaN [7000,2000,3000]
3 wf [1000]
4 NaN [4000,2000,3000]
5 NaN [6000,8000,1000]
6 NaN [6000,8000,1000]