Convert a pandas df to defaultdict and look for matching positive and negative numbers-CodePudding

I have a dataframe like this:

from collections import defaultdict
import pandas as pd

d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
     'label':['A','A','B','G','A','BB','C','C','A','BB','B','AA','AA']
    ,'amount':[2,-12,12,-12,5,-5,2,3,5,3,3,10,-10]}
df = pd.DataFrame(d)
print(df)
    id  label   amount
0   1   A       2
1   1   A      -12
2   1   B       12
3   1   G      -12
4   2   A       5
5   2   BB     -5
6   3   C      2
7   3   C      3
8   3   A      5
9   4   BB     3
10  4   B      3
11  4   AA    10
12  4   AA   -10

What i want to do know i convert the df into a defaultdict and look for only the matching matching positive and negative numbers per id so i want my desired dict to look something like this:

defaultdict(list,
            {1: [{'label': 'B', 'amount': 12},
                 {'label': 'A', 'amount': -12},
                 {'label': 'G', 'amount': -12}],
             2: [{'label': 'C',  'amount': 5}, {'label': 'A', 'amount': -5}],
             4: [{'label': 'AA', 'amount': 10},
                {'label': 'AA', 'amount': -10}]})

I try to do this below but i get this error:

d = defaultdict(list)
for  index,row in df.iterrows():
    if row['amount'].isin(-row['amount']):
        dd[row["id"]].append(
            {  "label": row["label"],
                'amount':row['amount'] })



----> 3     if row['amount'].isin(-row['amount']):
      4         dd[row["id"]].append(
      5             {  "label": row["label"],

AttributeError: 'int' object has no attribute 'isin'

Any idea how to solve this would be very appreciated. Thanks!

CodePudding user response：

The problem here is that row['amount'] is actually an integer and the .isin() cannot be applied on integers, it's applied on dataframe or series

----> 3     if row['amount'].isin(-row['amount']):
      4         dd[row["id"]].append(
      5             {  "label": row["label"],

AttributeError: 'int' object has no attribute 'isin'

You can change that part of the code to check whether int is in list like so:

df['negative_amount'] = df['amount'] * -1
d = defaultdict(list)
for  index,row in df.iterrows():
    if row['amount'] in df['negative_amount'].tolist():
        d[row["id"]].append(
            {  "label": row["label"],
                'amount':row['amount'] })

Output

defaultdict(list,
            {1: [{'label': 'A', 'amount': -12},
              {'label': 'B', 'amount': 12},
              {'label': 'G', 'amount': -12}],
             2: [{'label': 'A', 'amount': 5}, {'label': 'BB', 'amount': -5}],
             3: [{'label': 'A', 'amount': 5}],
             4: [{'label': 'AA', 'amount': 10},
              {'label': 'AA', 'amount': -10}]})

CodePudding user response：

In pandas iterrows is not recommneded, check this answer. Alternative pandas only solution with same logic like your solution - per groups test if at least one match, filter by boolean indexing and convert to dictioanry per id with DataFrame.to_dict:

def f(x):
    a = x.to_numpy()
    return np.any(a == -a[:, None], axis=1)
 

d = (df[df.groupby('id')['amount'].transform(f)]
       .groupby('id')[['label','amount']]
       .apply(lambda x: x.to_dict(orient='records'))
       .to_dict())
  
print (d)
{1: [{'label': 'A', 'amount': -12}, 
     {'label': 'B', 'amount': 12},
     {'label': 'G', 'amount': -12}], 
 2: [{'label': 'A', 'amount': 5},
     {'label': 'BB', 'amount': -5}], 
 4: [{'label': 'AA', 'amount': 10},
     {'label': 'AA', 'amount': -10}]}

Or is possible filter all duplicated with absolute amount, but also is necessary test if exist negative with DataFrameGroupBy.nunique per groups:

f = lambda x: x.to_dict(orient='records')
df1 = df.assign(amount = df['amount'].abs(), new=np.sign(df['amount']))
m = (df1.groupby(['id','amount'])['new'].transform('nunique').gt(1) & 
     df1.duplicated(['id','amount'], keep=False))

d = df[m].groupby('id')[['label','amount']].apply(f).to_dict()
print (d)
{1: [{'label': 'A', 'amount': -12}, 
     {'label': 'B', 'amount': 12},
     {'label': 'G', 'amount': -12}],
 2: [{'label': 'A', 'amount': 5}, 
     {'label': 'BB', 'amount': -5}],
 4: [{'label': 'AA', 'amount': 10}, 
     {'label': 'AA', 'amount': -10}]}

If small DataFrame and performance not important need test values per groups by id, for test use == with Series.any:

from collections import defaultdict

df['negative_amount'] = - df['amount']
d = defaultdict(list)

for i, g in df.groupby('id'):
    for  index,row in g.iterrows():
        if (g['negative_amount'] == row['amount']).any():
            d[i].append({  "label": row["label"], 'amount':row['amount'] })
            
print (d)
defaultdict(<class 'list'>, {1: [{'label': 'A', 'amount': -12},
                                 {'label': 'B', 'amount': 12},
                                 {'label': 'G', 'amount': -12}], 
                             2: [{'label': 'A', 'amount': 5}, 
                                 {'label': 'BB', 'amount': -5}],
                             4: [{'label': 'AA', 'amount': 10},
                                 {'label': 'AA', 'amount': -10}]})

Performance in 10k rows and 1k groups, last solution is slowiest:

np.random.seed(123)

N = 10000
d = {'id': np.random.randint(1000, size=N),
     'label':np.random.choice(['A','A','B','G','A','BB','C','C','A','BB','B','AA','AA'], size=N),
     'amount':np.random.choice([2,-12,12,-12,5,-5,2,3,5,3,3,10,-10], size=N)}
df = pd.DataFrame(d).sort_values('id', ignore_index=True)
print(df)

In [47]: %%timeit
    ...: def f(x):
    ...:     a = x.to_numpy()
    ...:     return np.any(a == - a[:, None], axis=1)
    ...: 
    ...: d = (df[df.groupby('id')['amount'].transform(f)]
    ...:         .groupby('id')[['label','amount']]
    ...:         .apply(lambda x: x.to_dict(orient='records'))
    ...:         .to_dict())
    ...:         
225 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [48]: %%timeit
    ...: f = lambda x: x.to_dict(orient='records')
    ...: df1 = df.assign(amount = df['amount'].abs(), new=np.sign(df['amount']))
    ...: m = (df1.groupby(['id','amount'])['new'].transform('nunique').gt(1) & 
    ...:      df1.duplicated(['id','amount'], keep=False))
    ...: 
    ...: d = df[m].groupby('id')[['label','amount']].apply(f).to_dict()
    ...: 
124 ms ± 9.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [49]: %%timeit
    ...: df['negative_amount'] = - df['amount']
    ...: d = defaultdict(list)
    ...: 
    ...: for i, g in df.groupby('id'):
    ...:     for  index,row in g.iterrows():
    ...:         if (g['negative_amount'] == row['amount']).any():
    ...:             d[i].append({  "label": row["label"], 'amount':row['amount'] })
    ...:             
3.51 s ± 366 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)