I have a dataframe like this:
from collections import defaultdict
import pandas as pd
d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
'label':['A','A','B','G','A','BB','C','C','A','BB','B','AA','AA']
,'amount':[2,-12,12,-12,5,-5,2,3,5,3,3,10,-10]}
df = pd.DataFrame(d)
print(df)
id label amount
0 1 A 2
1 1 A -12
2 1 B 12
3 1 G -12
4 2 A 5
5 2 BB -5
6 3 C 2
7 3 C 3
8 3 A 5
9 4 BB 3
10 4 B 3
11 4 AA 10
12 4 AA -10
What i want to do know i convert the df into a defaultdict and look for only the matching matching positive and negative numbers per id so i want my desired dict to look something like this:
defaultdict(list,
{1: [{'label': 'B', 'amount': 12},
{'label': 'A', 'amount': -12},
{'label': 'G', 'amount': -12}],
2: [{'label': 'C', 'amount': 5}, {'label': 'A', 'amount': -5}],
4: [{'label': 'AA', 'amount': 10},
{'label': 'AA', 'amount': -10}]})
I try to do this below but i get this error:
d = defaultdict(list)
for index,row in df.iterrows():
if row['amount'].isin(-row['amount']):
dd[row["id"]].append(
{ "label": row["label"],
'amount':row['amount'] })
----> 3 if row['amount'].isin(-row['amount']):
4 dd[row["id"]].append(
5 { "label": row["label"],
AttributeError: 'int' object has no attribute 'isin'
Any idea how to solve this would be very appreciated. Thanks!
CodePudding user response:
The problem here is that row['amount'] is actually an integer and the .isin() cannot be applied on integers, it's applied on dataframe or series
----> 3 if row['amount'].isin(-row['amount']):
4 dd[row["id"]].append(
5 { "label": row["label"],
AttributeError: 'int' object has no attribute 'isin'
You can change that part of the code to check whether int is in list like so:
df['negative_amount'] = df['amount'] * -1
d = defaultdict(list)
for index,row in df.iterrows():
if row['amount'] in df['negative_amount'].tolist():
d[row["id"]].append(
{ "label": row["label"],
'amount':row['amount'] })
Output
defaultdict(list,
{1: [{'label': 'A', 'amount': -12},
{'label': 'B', 'amount': 12},
{'label': 'G', 'amount': -12}],
2: [{'label': 'A', 'amount': 5}, {'label': 'BB', 'amount': -5}],
3: [{'label': 'A', 'amount': 5}],
4: [{'label': 'AA', 'amount': 10},
{'label': 'AA', 'amount': -10}]})
CodePudding user response:
In pandas iterrows
is not recommneded, check this answer. Alternative pandas only solution with same logic like your solution - per groups test if at least one match, filter by boolean indexing
and convert to dictioanry per id
with DataFrame.to_dict
:
def f(x):
a = x.to_numpy()
return np.any(a == -a[:, None], axis=1)
d = (df[df.groupby('id')['amount'].transform(f)]
.groupby('id')[['label','amount']]
.apply(lambda x: x.to_dict(orient='records'))
.to_dict())
print (d)
{1: [{'label': 'A', 'amount': -12},
{'label': 'B', 'amount': 12},
{'label': 'G', 'amount': -12}],
2: [{'label': 'A', 'amount': 5},
{'label': 'BB', 'amount': -5}],
4: [{'label': 'AA', 'amount': 10},
{'label': 'AA', 'amount': -10}]}
Or is possible filter all duplicated with absolute amount
, but also is necessary test if exist negative with DataFrameGroupBy.nunique
per groups:
f = lambda x: x.to_dict(orient='records')
df1 = df.assign(amount = df['amount'].abs(), new=np.sign(df['amount']))
m = (df1.groupby(['id','amount'])['new'].transform('nunique').gt(1) &
df1.duplicated(['id','amount'], keep=False))
d = df[m].groupby('id')[['label','amount']].apply(f).to_dict()
print (d)
{1: [{'label': 'A', 'amount': -12},
{'label': 'B', 'amount': 12},
{'label': 'G', 'amount': -12}],
2: [{'label': 'A', 'amount': 5},
{'label': 'BB', 'amount': -5}],
4: [{'label': 'AA', 'amount': 10},
{'label': 'AA', 'amount': -10}]}
If small DataFrame and performance not important need test values per groups by id
, for test use ==
with Series.any
:
from collections import defaultdict
df['negative_amount'] = - df['amount']
d = defaultdict(list)
for i, g in df.groupby('id'):
for index,row in g.iterrows():
if (g['negative_amount'] == row['amount']).any():
d[i].append({ "label": row["label"], 'amount':row['amount'] })
print (d)
defaultdict(<class 'list'>, {1: [{'label': 'A', 'amount': -12},
{'label': 'B', 'amount': 12},
{'label': 'G', 'amount': -12}],
2: [{'label': 'A', 'amount': 5},
{'label': 'BB', 'amount': -5}],
4: [{'label': 'AA', 'amount': 10},
{'label': 'AA', 'amount': -10}]})
Performance in 10k
rows and 1k
groups, last solution is slowiest:
np.random.seed(123)
N = 10000
d = {'id': np.random.randint(1000, size=N),
'label':np.random.choice(['A','A','B','G','A','BB','C','C','A','BB','B','AA','AA'], size=N),
'amount':np.random.choice([2,-12,12,-12,5,-5,2,3,5,3,3,10,-10], size=N)}
df = pd.DataFrame(d).sort_values('id', ignore_index=True)
print(df)
In [47]: %%timeit
...: def f(x):
...: a = x.to_numpy()
...: return np.any(a == - a[:, None], axis=1)
...:
...: d = (df[df.groupby('id')['amount'].transform(f)]
...: .groupby('id')[['label','amount']]
...: .apply(lambda x: x.to_dict(orient='records'))
...: .to_dict())
...:
225 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [48]: %%timeit
...: f = lambda x: x.to_dict(orient='records')
...: df1 = df.assign(amount = df['amount'].abs(), new=np.sign(df['amount']))
...: m = (df1.groupby(['id','amount'])['new'].transform('nunique').gt(1) &
...: df1.duplicated(['id','amount'], keep=False))
...:
...: d = df[m].groupby('id')[['label','amount']].apply(f).to_dict()
...:
124 ms ± 9.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [49]: %%timeit
...: df['negative_amount'] = - df['amount']
...: d = defaultdict(list)
...:
...: for i, g in df.groupby('id'):
...: for index,row in g.iterrows():
...: if (g['negative_amount'] == row['amount']).any():
...: d[i].append({ "label": row["label"], 'amount':row['amount'] })
...:
3.51 s ± 366 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)