import pandas as pd
# Sample input: column "A" is a row id, column "B" holds one list of
# observed values per row.
data = dict(
    A=[1, 2],
    B=[
        [1, 1, 1, 2, 2, 4, 4, 4, 4],
        [5, 4, 8, 1, 1, 1, 3, 2, 4, 2, 2, 2, 1, 1, 1],
    ],
)
df = pd.DataFrame(data)
A | B |
---|---|
1 | [1, 1, 1, 2, 2, 4, 4, 4, 4] |
2 | [5, 4, 8, 1, 1, 1, 3, 2, 4, 2, 2, 2, 1, 1, 1] |
def top_frequent(a):
    """Return the values of *a* that occur most often.

    A value is kept when its occurrence count is at least the 75th
    percentile of all occurrence counts (using the "higher" rule, i.e. the
    percentile is rounded up to an actual count) and it occurs more than
    once.

    Args:
        a: An iterable of hashable values (e.g. one list from column ``B``).
            Passing an iterable whose items are lists raises
            ``TypeError: unhashable type: 'list'``.

    Returns:
        A list of the selected values, ordered by descending count; values
        with equal counts keep their first-seen order. Empty input yields
        an empty list.
    """
    # Function-scope imports, mirroring the original block's local import.
    import math
    from collections import Counter

    counts = Counter(a)
    if not counts:
        # No values at all: there is no percentile to compute.
        return []

    # 75th percentile of the counts with the "higher" rule: take the sorted
    # counts and round the fractional position 0.75 * (n - 1) up to the next
    # index. This matches numpy.percentile(occ, 75, method='higher') without
    # depending on the deprecated ``interpolation`` keyword.
    ordered = sorted(counts.values())
    Z = ordered[math.ceil(0.75 * (len(ordered) - 1))]
    print(Z)  # kept from the original: shows the threshold that was used

    # Values that occur only once are never reported, even if they meet Z.
    frequent = [(value, n) for value, n in counts.items() if n >= Z and n != 1]
    # Stable sort: highest count first, ties stay in first-seen order.
    frequent.sort(key=lambda pair: pair[1], reverse=True)
    return [value for value, _ in frequent]
# NOTE: this is the call that raises the TypeError shown in the traceback
# below. top_frequent is invoked once with the whole column df['B'], so each
# `j` in its loop is a list, and a list cannot be looked up in a dict.
df['C'] = df.apply(top_frequent(df['B']))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_13728/2052560572.py in <module>
28 return res
29
---> 30 df['C'] = df.apply(top_frequent(df['B']))
~\AppData\Local\Temp/ipykernel_13728/2052560572.py in top_frequent(ids)
4 k = {}
5 for j in ids:
----> 6 if j in k:
7 k[j] =1
8 else:
TypeError: unhashable type: 'list'
When I apply the function to just one row it works fine, but when I apply it to all rows I get this error: TypeError: unhashable type: 'list'
CodePudding user response:
The problem is that when you pass df['B']
into top_frequent()
, df['B']
is a column of lists; you can view it as a list of lists.
So in your for j in a:
, you are getting each item from the outer list. For a list of lists, each item is itself a list.
Then in k[j]
(and in the membership test if j in k, where the traceback points), you are using a list as a dictionary key, which Python does not support because lists are unhashable. So it gives you the error TypeError: unhashable type: 'list'
.
You can try
# Pass the function itself so it is called once per element of column B
# (each element is a single list).
df['C'] = df['B'].apply(top_frequent)
# or: go row by row (axis=1) and pass only that row's B value
df['C'] = df.apply(lambda row: top_frequent(row['B']), axis=1)
Besides, you can do this in a more pandas-idiomatic way:
# For each list in column B: count occurrences with value_counts(), then keep
# the values whose count equals the maximum count (all of them on ties).
# NOTE(review): this keeps only the most frequent value(s); it does not apply
# the 75th-percentile threshold that top_frequent uses.
df['C'] = df['B'].apply(lambda x: (lambda y: (y[y==y.max()].index.tolist()))(pd.Series(x).value_counts()))