I have a pandas dataframe with a a set of values (prices). Within each group of initiator_id
I need to sort prices ascending if the type == sell
, and descending if the type == buy
.
Then I add an id within each group. Right now I do:
df['bidnum'] = df.groupby(['initiator_id', 'type']).cumcount()
What is the efficient way to sort ascending within each group 'initiator_id', 'type == sell'
and descending for 'initiator_id', 'type == buy'
?
Here is how the original dataset looks like now:
initiator_id price type bidnum
1 170.81 sell 0
2 170.81 sell 0
2 169.19 buy 0
3 170.81 sell 0
3 169.19 buy 0
3 70.81 sell 1
4 170.81 sell 0
4 169.19 buy 0
4 70.81 sell 1
4 69.19 buy 1
and I need something like:
initiator_id, price, type
1, 100,sell
1, 99, sell
1, 98, sell
1, 110, buy
1, 120, buy
1, 125, buy
So that sell
subgroup within each initiator_id
group is sorted descending, and buy
subgroup is sorted ascending.
CodePudding user response:
If you can assume that your "price"
column will always contain non-negative values, we could "cheat". Assign a negative value to the prices of buy or sell operations, sort, and then calculate the absolute value to go back to the original prices:
If type is
"buy"
, the price remains positive (2 * 1 - 1 = 1). If type is"sell"
, the price will become negative (2 * 0 - 1 = -1).df["price"] = df["price"] * (2 * (df["type"] == "buy").astype(int) - 1)
Now sort values normally. I've included both
"initiator_id"
and"type"
columns to match your expected output:df = df.sort_values(["initiator_id", "type", "price"])
Finally, calculate the absolute value of the
"price"
column to retrieve your original values:df["price"] = df["price"].abs()
Expected output of this operation on your sample input:
initiator_id price type bidnum
0 1 170.81 sell 0
2 2 169.19 buy 0
1 2 170.81 sell 0
4 3 169.19 buy 0
3 3 170.81 sell 0
5 3 70.81 sell 1
9 4 69.19 buy 1
7 4 169.19 buy 0
6 4 170.81 sell 0
8 4 70.81 sell 1
CodePudding user response:
One solution:
final_df = pd.DataFrame()
grouped_df = df.groupby(['initiator_id', 'type'])
for key, item in grouped_df:
dfg = grouped_df.get_group(key).reset_index()
final_df = final_df.append(dfg.sort_values('price', ascending=(dfg.loc[0, 'type']=='buy')))
final_df.drop(final_df.columns[0], axis=1, inplace=True)
final_df.reset_index(inplace=True, drop=True)
Output:
initiator_id price type
0 1 170.81 sell
1 2 169.19 buy
2 2 170.81 sell
3 3 169.19 buy
4 3 170.81 sell
5 3 70.81 sell
6 4 69.19 buy
7 4 169.19 buy
8 4 170.81 sell
9 4 70.81 sell
CodePudding user response:
Everyone else gave a solution with pandas. Here I present a solution without pandas.
The input CSV:
initiator_id,price,type,bidnum
1,170.81,sell,0
2,170.81,sell,0
2,169.19,buy,0
3,170.81,sell,0
3,169.19,buy,0
3,70.81,sell,1
4,170.81,sell,0
4,169.19,buy,0
4,70.81,sell,1
4,69.19,buy,1
The output CSV:
initiator_id,price,type,bidnum
1,170.81,sell,0
2,170.81,sell,0
2,169.19,buy,0
3,170.81,sell,0
3,70.81,sell,1
3,169.19,buy,0
4,170.81,sell,0
4,70.81,sell,1
4,69.19,buy,1
4,169.19,buy,0
The code:
from collections import OrderedDict
import numpy
"""
the reason why this code uses exec is so that the ordering of columns can be arbitrary
"""
def remove_duplicates(seq):
seen = set()
seen_add = seen.add
return [x for x in seq if not (x in seen or seen_add(x))]
def returnLastIndex(temp2):
global mydict
temp3 = mydict['initiator_id'][temp2]
while True:
temp2 = temp2 1
try:
if mydict['initiator_id'][temp2] != temp3:
return temp2-1
except:
return temp2-1
def returnFirstIndex(temp2):
global mydict
temp3 = mydict['initiator_id'][temp2]
while temp2 >= 1:
temp2 = temp2 - 1
if mydict['initiator_id'][temp2] != temp3:
return temp2 1
return 0
with open("input.csv") as file:
lines = file.readlines()
new_lines = []
new_headers = []
for x in range(len(lines)): #loop to reamove headers and newlines
if x == 0:
for y in lines[x].strip().split(","):
new_headers.append(y)
else:
new_lines.append(lines[x].strip())
mydict = OrderedDict()
for x in new_headers:
exec("mydict['" x "'] = []")
for x in range(len(new_headers)):
for y in new_lines:
if new_headers[x] == "initiator_id":
exec("mydict['" new_headers[x] "'].append(int('" y.split(",")[x] "'))")
elif new_headers[x] == "price":
exec("mydict['" new_headers[x] "'].append(float('" y.split(",")[x] "'))")
else:
exec("mydict['" new_headers[x] "'].append('" y.split(",")[x] "')")
for x in new_headers:
exec("mydict['" x "'] = numpy.array(mydict['" x "'])")
temp1 = mydict['initiator_id'].argsort()
for x in (new_headers):
exec("mydict['" x "'] = mydict['" x "'][temp1]")
splice_list_first = []
for x in range(len(mydict['initiator_id'])):
splice_list_first.append(returnFirstIndex(x))
splice_list_last = []
for x in range(len(mydict['initiator_id'])):
splice_list_last.append(returnLastIndex(x))
splice_list_first = remove_duplicates(splice_list_first)
splice_list_last = remove_duplicates(splice_list_last)
master_string = ",".join(new_headers) "\n"
for x in range(len(splice_list_first)):
temp4 = OrderedDict()
for y in new_headers:
exec("temp4['" y "'] = mydict['" y "'][" str(splice_list_first[x]) ":" str(splice_list_last[x] 1) "]")
sell_index = []
buy_index = []
for z in range(len(temp4['type'])):
if temp4['type'][z] == "sell":
sell_index.append(z)
if temp4['type'][z] == "buy":
buy_index.append(z)
temp5 = OrderedDict()
for a in range(len(sell_index)):
for b in new_headers:
try:
exec("temp5['" b "']")
except:
exec("temp5['" b "'] = []")
exec("temp5['" b "'].append(temp4['" b "'][" str(sell_index[a]) ":" str(sell_index[a] 1) "][0])")
try:
for c in new_headers:
exec("temp5['" c "'] = numpy.array(temp5['" c "'])")
temp7 = temp5['price'].argsort()[::-1]
for d in (new_headers):
exec("temp5['" d "'] = temp5['" d "'][temp7]")
for e in range(len(temp5['initiator_id'])):
for f in new_headers:
master_string = master_string str(temp5[f][e]) ","
master_string = master_string[:-1] "\n"
except Exception as g:
pass
temp6 = OrderedDict()
for a in range(len(buy_index)):
for b in new_headers:
try:
exec("temp6['" b "']")
except:
exec("temp6['" b "'] = []")
exec("temp6['" b "'].append(temp4['" b "'][" str(buy_index[a]) ":" str(buy_index[a] 1) "][0])")
try:
for c in new_headers:
exec("temp6['" c "'] = numpy.array(temp6['" c "'])")
temp7 = temp6['price'].argsort()
for d in (new_headers):
exec("temp6['" d "'] = temp6['" d "'][temp7]")
for e in range(len(temp6['initiator_id'])):
for f in new_headers:
master_string = master_string str(temp6[f][e]) ","
master_string = master_string[:-1] "\n"
except Exception as g:
pass
print(master_string)
f = open("output.csv", "w")
f.write(master_string)
f.close()