Dataframe: create pairwise difference-CodePudding

I have this dataframe:

Name	Score 1	Score 2	Score 3	...	Score 20
Max	2	3	5	...	2
Jacob	1	4	3	...	6

How do I get the pairwise difference? So the result should look like:

Name	Score 1-2	Score 1-3	Score 1-20	...	Score 2-3	Score 2-4	...	Score 19-20
Max	-1	-3	3	...	-2	2	...	3
Jacob	-3	-2	2	...	-1	-2	...	2

I created an empty Dataframe and tried to fill in via for-loops, but this takes very long as I have a few 100k names.

The final Dataframe should have 191 columns:

liste =["Name"]

for i in range (1,21):
    for j in range (i 1, 21):
        liste.append("Score " str(i) " to Score " str(j))
df = pd.DataFrame(columns=liste)

CodePudding user response：

Here's one way to use a loop over the second dimension and iteratively use rsub to subtract the remaining columns from the "current" column. Then after the loop, concatenate all DataFrames:

out = []
for i in range(1, df.shape[1]):
    tmp = df.iloc[:, i 1:].rsub(df.iloc[:, i], axis=0)
    tmp.columns = [f'Score {i}-{num}' for num in tmp.columns.str.split().str[1]]
    out.append(tmp)
out = pd.concat(out, axis=1)

Output:

   Score 1-2  Score 1-3  Score 1-20  Score 2-3  Score 2-20  Score 3-20
0         -1         -3           0         -2           1           3
1         -3         -2          -5          1          -2          -3

CodePudding user response：

Use numpy broadcasting to calculate the difference between the columns:

# Create some sample data for our tests
import string
from itertools import permutations

n_names = 100_000
all_names = [''.join(p) for p in permutations(list(string.ascii_uppercase), 4)]

names = np.random.choice(all_names, n_names, replace=False)
scores = np.random.randint(1, 100, (20, n_names))
df = pd.DataFrame({
    "Name": names,
    **{
        f"Score {i 1}": scores[i] for i in range(20)
    }
})

# ----------------------------------------------
# The actual code
# Convert the scores into a numpy array
scores = df.loc[:, "Score 1":"Score 20"].to_numpy().T

# Take advantage of numpy broadcast to subtract every score against every other
# score, including itself. `diff` is a (20, 20, n_names) array
diff = scores[:, None] - scores

# We don't need every difference, only Score i - j where i < j
result = pd.DataFrame({
    "Name": df["Name"],
    **{
        f"Score {i 1} - {j 1}": diff[i, j]
        for i in range(20) for j in range(20) if i < j
    }
})

CodePudding user response：

I think using numpy's automatic broadcasting is the fastest

df = pd.DataFrame({'Name':['A', 'B'], 'Score1':[2,1], 'Score2':[3,4], 'Score3':[5,6], 'Score4':[7, 8]})
n_col = df.shape[1] - 1
arr = df.iloc[:, 1:].values
names_list = ['Score {}-{}'.format(i, j) for i in range(1,1 n_col) for j in range(1, 1 n_col) if i>j]
idx_list = [i j*n_col for i in range(n_col) for j in range(n_col) if i>j]

# actual code
pd.DataFrame((arr[:, None, :] - arr[:, :, None]).reshape(df.shape[0], -1)[:, idx_list], columns = names_list)

    Score 2-1   Score 3-1   Score 3-2   Score 4-1   Score 4-2   Score 4-3
0   1           3           2           5           4           2
1   3           5           2           7           4           2

Did a comparison of the methods so far on 10k data and 20 columns:

import numpy as np
import pandas as pd
from itertools import combinations

def f1(df):
    # @Z Li
    n_col = df.shape[1] - 1
    arr = df.iloc[:, 1:].values
    names_list = ['Score {} - {}'.format(i, j) for i in range(1,1 n_col) for j in range(1, 1 n_col) if i>j]
    idx_list = [i j*n_col for i in range(n_col) for j in range(n_col) if i>j]
    df_new = pd.DataFrame((arr[:, None, :] - arr[:, :, None]).reshape(df.shape[0], -1)[:, idx_list], columns = names_list)
    df_new['Name'] = df['Name']
    return df_new

def f2(df):
    # @Code Different
    scores = df.loc[:, "Score 1":"Score 20"].to_numpy().T

    # Take advantage of numpy broadcast to subtract every score against every other
    # score, including itself. `diff` is a (20, 20, n_names) array
    diff = scores[:, None] - scores

    # We don't need every difference, only Score i - j where i < j
    result = pd.DataFrame({
        "Name": df["Name"],
        **{
            f"Score {i 1} - {j 1}": diff[i, j]
            for i in range(20) for j in range(20) if i < j
        }
    })
    return result

def f3(df):
    # @enke
    out = []
    for i in range(1, df.shape[1]):
        tmp = df.iloc[:, i 1:].rsub(df.iloc[:, i], axis=0)
        tmp.columns = [f'Score {i}-{num}' for num in tmp.columns.str.split().str[1]]
        out.append(tmp)
    out = pd.concat(out, axis=1)
    return out

def f4(df):
    # @Cameron Riddell
    df = df.set_index('Name')
    column_pairs = pd.MultiIndex.from_tuples(combinations(df.columns, r=2))
    pairwise_differences = (
        df.reindex(columns=column_pairs, level=0) 
        - df.reindex(columns=column_pairs, level=1)
    )
    pairwise_differences.columns = [
    f'Score {col[0].split(" ")[1]}-{col[1].split(" ")[1]}'
    for col in pairwise_differences.columns]
    return pairwise_differences.reset_index()

df = pd.DataFrame({"Name": range(10000), **{
        f"Score {i}": i for i in range(1, 21)
    }})

%timeit f1(df)
# 14 ms ± 412 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit f2(df)
# 18.7 ms ± 417 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit f3(df)
# 24 ms ± 607 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit f4(df)
# 21.7 ms ± 568 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

It is interesting when data hits 100k x 20:

%timeit f1(df)
# 379 ms ± 9.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit f2(df)
# 383 ms ± 8.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit f3(df)
# 260 ms ± 2.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit f4(df)
# 523 ms ± 4.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

CodePudding user response：

You can perform the main bulk of these computations in a vectorized format like so:

import pandas as pd
from itertools import combinations

df = pd.DataFrame({
    "Name": ["Max", "Jacob"],
    "Score 1": [2, 1],
    "Score 2": [3, 4],
    "Score 3": [5, 3],
    "Score 4": [10, 9],
}).set_index("Name")
 
column_pairs = pd.MultiIndex.from_tuples(combinations(df.columns, r=2)) 
pairwise_differences = (
    df.reindex(columns=column_pairs, level=0) 
    - df.reindex(columns=column_pairs, level=1)
)

print(pairwise_differences)
      Score 1                 Score 2         Score 3
      Score 2 Score 3 Score 4 Score 3 Score 4 Score 4
Name                                                 
Max        -1      -3      -8      -2      -7      -5
Jacob      -3      -2      -8       1      -5      -6

From there it's all about cleaning up the column labels from a MultiIndex to a regular Index:

pairwise_differences.columns = [
    f'Score {col[0].removeprefix("Score ")}-{col[1].removeprefix("Score ")}'
    for col in pairwise_differences.columns
]

print(pairwise_differences)
       Score 1-2  Score 1-3  Score 1-4  Score 2-3  Score 2-4  Score 3-4
Name                                                                   
Max           -1         -3         -8         -2         -7         -5
Jacob         -3         -2         -8          1         -5         -6

Alternatively, you can iterate over combinations of columns with a split-apply-combine type of approach:

import pandas as pd
from itertools import combinations

df = pd.DataFrame({
    "Name": ["Max", "Jacob"],
    "Score 1": [2, 1],
    "Score 2": [3, 4],
    "Score 3": [5, 3],
    "Score 4": [10, 9],
}).set_index("Name")

results = {}
for left, right in combinations(df.columns, r=2):
    left_id = left.removeprefix("Score ")
    right_id = right.removeprefix("Score ")
    results[f"Score {left_id}-{right_id}"] = df[left] - df[right]

out = pd.concat(results, axis=1)
print(out)
       Score 1-2  Score 1-3  Score 1-4  Score 2-3  Score 2-4  Score 3-4
Name                                                                   
Max           -1         -3         -8         -2         -7         -5
Jacob         -3         -2         -8          1         -5         -6