Calculate a difference in times between each pair of values in class using Pandas-CodePudding

I am trying to calculate the difference in the "time" column between each pair of elements having the same value in the "class" column. This is an example of an input:

  class   name                time
0     A    Bob 2022-09-05 07:22:15
1     A    Sam 2022-09-04 17:18:29
2     B    Bob 2022-09-04 03:29:06
3     B    Sue 2022-09-04 01:28:34
4     A  Carol 2022-09-04 10:40:23

And this is an output:

class  name1  name2        timeDiff
0     A    Bob  Carol 0 days 20:41:52
1     A    Bob    Sam 0 days 14:03:46
2     A  Carol    Sam 0 days 06:38:06
3     B    Bob    Sue 0 days 02:00:32

I wrote this code to solve this problem:

from itertools import combinations


df2 = pd.DataFrame(columns=['class', 'name1', 'name2', 'timeDiff'])

for c in df['class'].unique():
    df_class = df[df['class'] == c]
    groups = df_class.groupby(['name'])['time']

    if len(df_class) > 1:
        out = (pd
               .concat({f'{k1} {k2}': pd.Series(data=np.abs(np.diff([g2.values[0],g1.values[0]])).astype('timedelta64[s]'), index=[f'{k1} {k2}'], name='timeDiff')
                        for (k1, g1), (k2, g2) in combinations(groups, 2)},
                       names=['name']
                      )
               .reset_index()
              )
        new = out["name"].str.split(" ", n = -1, expand = True)

        out["name1"]= new[0].astype(str)
        out["name2"]= new[1].astype(str)
        out["class"] = c

        del out['level_1'], out['name']

        df2 = df2.append(out, ignore_index=True)

I didn't come up with a solution without going through all the class values in a loop. However, this is very time-consuming if the input table is large. Does anyone have any solutions without using a loop?

CodePudding user response：

The whole thing is a self cross join and a time difference

import pandas as pd

df = pd.DataFrame({
    'class': ['A', 'A', 'B', 'B', 'A'],
    'name': ['Bob', 'Sam', 'Bob', 'Sue', 'Carol'],
    'time': [
        pd.Timestamp('2022-09-05 07:22:15'),
        pd.Timestamp('2022-09-04 17:18:29'),
        pd.Timestamp('2022-09-04 03:29:06'),
        pd.Timestamp('2022-09-04 01:28:34'),
        pd.Timestamp('2022-09-04 10:40:23'),
    ]
})

rs = list()

for n, df_g in df.groupby('class'):
    t_df = df_g.merge(
        df_g, how='cross',
        suffixes=('_1', '_2')
    )
    t_df = t_df[t_df['name_1'] != t_df['name_2']]
    t_df = t_df.drop(['class_2'], axis=1)\
        .rename({'class_1': 'class'}, axis=1).reset_index(drop=True)
    t_df['timeDiff'] = abs(t_df['time_1'] - t_df['time_2'])\
        .astype('timedelta64[ns]')
    t_df = t_df.drop(['time_1', 'time_2'], axis=1)
    rs.append(t_df)

rs_df = pd.concat(rs).reset_index(drop=True)

CodePudding user response：

Check below code without Outerjoin, using Aggegrate & Itertools

from itertools import combinations

# Function to create list of names
def agg_to_list(value):
    return list(list(i) for i in combinations(list(value), 2))

# Fucntion to calculate list of time & calculate differences between them
def agg_to_list_time(value):
    return  [ t[0] - t[1] for t in  list(combinations(list(value), 2))]

# Apply aggregate functions
updated_df = df.groupby(['class']).agg({'name':agg_to_list,'time':agg_to_list_time})

# Explode DataFrame & rename column
updated_df = updated_df.explode(['name','time']).rename(columns={'time':'timediff'})

# Unpack name column in two new columns
updated_df[['name1','name2']] = pd.DataFrame(updated_df.name.tolist(), index=updated_df.index)

# Final DataFrame
print(updated_df.reset_index()[['class','name1','name2','timediff']])

Output:

  class name1  name2        timediff
0     A   Bob    Sam 0 days 14:03:46
1     A   Bob  Carol 0 days 20:41:52
2     A   Sam  Carol 0 days 06:38:06
3     B   Bob    Cue 0 days 02:00:32