I have 2 Data Frames df1 and df2 containing thousands of rows and I want to pick the top 20 items based on the highest Gross Margin and cost less than 100 from df2 from a certain group category and assign those to the same category in df1 depending.
here are the Data-Frames and output
df1= pd.DataFrame({'group-id':['00-1111','00-1111','00-1111','00-1111','01-1112','01-1112','01-1112','02-1113','02-1113','02-1113','02-1113'],'group-nm': ['apples','mangos','bananas','pears','bananas','orange','pears','pears','mangos','bananas','pineapple']})
df2 = pd.DataFrame({'group-nm':['pears','orange','orange','orange','pineapple','orange','apples','pears','apples','orange','orange','mangos','orange','bananas','orange','bananas','pears','apples','pears','pineapple','mangos','bananas','bananas','pears','apples','apples','bananas','mangos','pears','pineapple','pears','apples','pears','mangos','orange','orange','apples','mangos','apples','mangos','orange','mangos','apples','apples','mangos','mangos','pears','mangos','pineapple','pineapple','mangos','pears','orange','pineapple','apples','bananas','bananas','pineapple','pears','bananas','orange','pears','orange','orange','apples','apples','bananas','mangos','pineapple','orange','mangos','pears','mangos','mangos','bananas','mangos','orange','pineapple','bananas','bananas','pineapple','mangos','apples','mangos','bananas','mangos','apples','orange','pears','apples','pineapple','mangos','apples','mangos','orange','mangos','pineapple','apples','bananas','apples','pears','orange','pears','apples','apples','pineapple','bananas','pineapple','pineapple','mangos','pears','pineapple','apples','mangos','orange','apples','mangos','pineapple','orange','apples','pears','pears','bananas','mangos','bananas','orange','mangos','orange','mangos','orange','bananas','mangos','apples','orange','pears','orange','bananas','pineapple','mangos','mangos','pears','bananas','apples','mangos','pineapple','apples','bananas','pineapple','apples','apples','apples','pineapple','mangos','bananas','apples','bananas','pineapple','orange','pineapple','apples'],'group-items': ['pears6','orange27','orange24','orange5','pineapple13','orange22','apples15','pears15','apples21','orange28','orange21','mangos14','orange1','bananas13','orange2','bananas6','pears21','apples1','pears9','pineapple3','mangos11','bananas18','bananas2','pears16','apples29','apples11','bananas19','mangos23','pears11','pineapple7','pears14','apples30','pears12','mangos29','orange8','orange16','apples6','mangos24','apples14','mangos5','orange23','mangos33','apples26','apples22','mangos27','mangos4','pears2','mangos32','pineapple18','pineapple15','mangos21','pears13','orange3','pineapple1','apples13','bananas17','bananas15','pineapple4','pears10','bananas11','orange12','pears17','orange25','orange13','apples20','apples8','bananas5','mangos13','pineapple11','orange9','mangos10','pears3','mangos16','mangos8','bananas10','mangos22','orange19','pineapple9','bananas9','bananas14','pineapple10','mangos26','apples28','mangos12','bananas21','mangos20','apples23','orange18','pears4','apples3','pineapple2','mangos3','apples19','mangos18','orange10','mangos15','pineapple23','apples12','bananas1','apples9','pears1','orange20','pears5','apples31','apples17','pineapple12','bananas4','pineapple5','pineapple20','mangos1','pears20','pineapple14','apples32','mangos30','orange14','apples25','mangos2','pineapple21','orange4','apples16','pears8','pears18','bananas12','mangos17','bananas8','orange15','mangos28','orange17','mangos7','orange7','bananas3','mangos31','apples5','orange6','pears7','orange26','bananas20','pineapple6','mangos19','mangos25','pears19','bananas23','apples2','mangos9','pineapple22','apples7','bananas22','pineapple17','apples10','apples4','apples27','pineapple16','mangos6','bananas7','apples24','bananas16','pineapple19','orange11','pineapple8','apples18'], 'gross-margin': [100,99,99,99,99,98,97,96,96,95,94,94,94,93,93,92,91,91,91,91,90,89,88,87,86,82,82,82,82,82,80,80,79,78,78,76,76,76,75,74,74,73,73,73,73,73,72,72,72,71,71,70,70,70,69,68,68,67,67,65,65,64,63,62,62,59,59,59,59,59,58,57,57,56,54,53,52,52,51,51,51,49,49,49,49,48,48,47,47,47,47,46,45,44,43,43,43,42,41,41,40,40,40,40,39,38,37,35,35,34,34,33,33,32,31,31,29,29,28,28,28,28,27,26,26,26,25,24,24,24,21,21,18,16,15,15,15,14,14,13,12,11,11,11,10,9,9,9,9,8,7,5,5,4,4,4,3,2,2,1],'cost':[85,86,34,77,92,51,46,14,24,19,32,60,58,107,51,17,90,25,7,7,24,28,36,67,71,40,85,52,106,23,45,50,2,5,27,79,45,8,80,17,65,20,98,106,39,103,33,90,73,89,93,74,51,81,66,84,91,70,50,75,62,12,54,29,102,58,30,25,99,89,38,10,58,40,34,59,58,56,66,56,61,55,9,41,83,58,72,1,92,53,73,68,81,85,47,107,55,96,22,67,52,35,43,34,77,64,55,5,13,66,89,66,108,13,24,39,37,20,37,17,70,58,89,69,98,81,56,38,77,14,15,28,30,45,49,110,78,70,4,77,44,22,53,16,83,74,3,90,36,87,1,57,78,17,38,45,66,5,87,32]})
# I am trying to get out-put data Frame like this
output= pd.DataFrame({'group-id':['00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','00-1111','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112','01-1112'],
'group-nm': ['apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','apples','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','mangos','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','bananas','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','orange','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears','pears'],
'group-itm':['apples15','apples21','apples1','apples29','apples11','apples30','apples6','apples14','apples26','apples13','apples8','apples28','apples23','apples3','apples19','apples12','apples9','apples31','apples17','apples25','mangos14','mangos11','mangos23','mangos29','mangos24','mangos5','mangos27','mangos33','mangos32','mangos21','mangos13','mangos10','mangos16','mangos8','mangos22','mangos12','mangos26','mangos20','mangos3','mangos18','bananas6','bananas18','bananas2','bananas19','bananas15','bananas17','bananas11','bananas5','bananas10','bananas14','bananas9','bananas21','bananas1','bananas4','bananas12','bananas8','bananas3','bananas20','bananas23','bananas22','pears6','pears15','pears21','pears9','pears16','pears14','pears12','pears2','pears13','pears10','pears17','pears3','pears4','pears1','pears5','pears20','pears18','pears8','pears7','pears19','bananas6','bananas18','bananas2','bananas19','bananas15','bananas17','bananas11','bananas5','bananas10','bananas14','bananas9','bananas21','bananas1','bananas4','bananas12','bananas8','bananas3','bananas20','bananas23','bananas22','orange24','orange27','orange5','orange22','orange28','orange1','orange21','orange2','orange8','orange16','orange23','orange3','orange12','orange25','orange13','orange9','orange19','orange18','orange10','orange20','pears6','pears15','pears21','pears9','pears16','pears14','pears12','pears2','pears13','pears10','pears17','pears3','pears4','pears1','pears5','pears20','pears18','pears8','pears7','pears19']})
I have tried group-by rank method
df2['rank']= df2.groupby(['group-nm'])['gross-margin'].rank("first", ascending=False)
df2['rank-cst']= df2.groupby(['group-nm'])['cost'].rank("first", ascending=False)
df = df2.loc[df2['rank']<= 20]
df = df2.loc[df2['rank-cst'] <= 20]
out_put= df1.merge(df[["group-nm","group_itm")]], how="left", on=["group-nm"])
but can not get the desired output.
CodePudding user response:
I can think of the following:
- Filter df2 where cost<100
- Sort by price
- Merge
- Groupby and select the top 20
Edited to fix the head properly with group ID group name
df2fs = df2[df2.cost<100].sort_values('gross-margin', ascending=False)
dfall = df1.merge(df2fs,how='outer',left_on="group-nm", right_on="group-nm")
dfoutput= dfall.groupby(["group-id","group-nm"]).head(20)
group-id group-nm group-items gross-margin cost
0 00-1111 apples apples15 97 46
1 00-1111 apples apples21 96 24
2 00-1111 apples apples1 91 25
3 00-1111 apples apples29 86 71
4 00-1111 apples apples11 82 40
.. ... ... ... ... ...
259 02-1113 pineapple pineapple14 33 66
260 02-1113 pineapple pineapple21 29 20
261 02-1113 pineapple pineapple6 14 70
262 02-1113 pineapple pineapple22 10 83
263 02-1113 pineapple pineapple17 9 90
With only top-2 items, to check output:
>>> df1.merge(df2fs,how='outer',left_on="group-nm", right_on="group-nm").groupby(["group-id","group-nm"]).head(2)
group-id group-nm group-items gross-margin cost
0 00-1111 apples apples15 97 46
1 00-1111 apples apples21 96 24
29 00-1111 mangos mangos14 94 60
30 00-1111 mangos mangos11 90 24
60 02-1113 mangos mangos14 94 60
61 02-1113 mangos mangos11 90 24
91 00-1111 bananas bananas6 92 17
92 00-1111 bananas bananas18 89 28
113 01-1112 bananas bananas6 92 17
114 01-1112 bananas bananas18 89 28
135 02-1113 bananas bananas6 92 17
136 02-1113 bananas bananas18 89 28
157 00-1111 pears pears6 100 85
158 00-1111 pears pears15 96 14
177 01-1112 pears pears6 100 85
178 01-1112 pears pears15 96 14
197 02-1113 pears pears6 100 85
198 02-1113 pears pears15 96 14
217 01-1112 orange orange5 99 77
218 01-1112 orange orange27 99 86
244 02-1113 pineapple pineapple13 99 92
245 02-1113 pineapple pineapple3 91 7
Edit2 Adding rows until 20.
This code is probably inefficient, and quite ugly IMO, but it would do the trick
temp = (20-dfoutput.groupby(["group-id","group-nm"])["group-items"].count()).apply(lambda x: x*[pd.NA]).reset_index().explode("group-items")
temp['group-items'] = temp['group-items'].fillna(kk['group-nm'] '999')
pd.concat([dfoutput, temp]).sort_values(['group-id','group-nm'])
# Example output for head-2 and fill-4 items
group-id group-nm group-items gross-margin cost
0 00-1111 apples apples15 97.0 46.0
1 00-1111 apples apples21 96.0 24.0
0 00-1111 apples apples999 NaN NaN
0 00-1111 apples apples999 NaN NaN
91 00-1111 bananas bananas6 92.0 17.0
92 00-1111 bananas bananas18 89.0 28.0
1 00-1111 bananas bananas999 NaN NaN
1 00-1111 bananas bananas999 NaN NaN
29 00-1111 mangos mangos14 94.0 60.0
30 00-1111 mangos mangos11 90.0 24.0
2 00-1111 mangos mangos999 NaN NaN
2 00-1111 mangos mangos999 NaN NaN
157 00-1111 pears pears6 100.0 85.0
158 00-1111 pears pears15 96.0 14.0
3 00-1111 pears pears999 NaN NaN
3 00-1111 pears pears999 NaN NaN
113 01-1112 bananas bananas6 92.0 17.0
114 01-1112 bananas bananas18 89.0 28.0
4 01-1112 bananas bananas999 NaN NaN
4 01-1112 bananas bananas999 NaN NaN
217 01-1112 orange orange5 99.0 77.0
218 01-1112 orange orange27 99.0 86.0
5 01-1112 orange orange999 NaN NaN
5 01-1112 orange orange999 NaN NaN
177 01-1112 pears pears6 100.0 85.0
178 01-1112 pears pears15 96.0 14.0
6 01-1112 pears pears999 NaN NaN
6 01-1112 pears pears999 NaN NaN
CodePudding user response:
Your question is a little unclear, but I did some work on it in what I believe your intent. I have also included some comments so you can change what is necessary to fill your requirements.
df2['Check'] = np.where(df2['gross-margin'] < 100, True, False)
df2 = df2.loc[df2['Check'] == True]
df2['RN'] = df2.sort_values(['gross-margin'], ascending = False).groupby(['group-nm']).cumcount() 1
df2 = df2.loc[df2['RN'].astype(int) <= 20]
df_merge = pd.merge(df2, df1, how = 'right', left_on = ['group-nm'], right_on = ['group-nm'])
df_merge