Home > Blockchain >  find pair of rows with largest value difference in pandas dataframe
find pair of rows with largest value difference in pandas dataframe

Time:10-25

i have a large dataframe containing the x,y,z coordinates of a surface. i am looking to find the pair of rows with the largest slope between them (dz/sqrt(dx^2 dy^2))

maxGrad = 0
currentGrad = 0
height = 0

for i in range(len(df)):
  for j in range(i 1,len(df)):
      height = abs(df.z.iloc[j]-df.z.iloc[i])
      distance = math.sqrt((df.x.iloc[j]-df.x.iloc[i])**2 (df.y.iloc[j]-df.y.iloc[i])**2)
      currentGrad = height/distance
      if currentGrad > maxGrad:
          maxGrad = currentGrad
          maxCoorPair = [df.x.iloc[i],df.y.iloc[i],df.x.iloc[j],df.y.iloc[j]]
print(maxGrad, maxCoorPair)

However this is not very elegant and the run time is very long due to the nested for loop.

How can i do it better?

CodePudding user response:

example data:

df = pd.DataFrame(
    {
        "x":[1,5,3],
        "y":[12,10,13],
        "z":[4,4,1],
    }
)

let's define a function which takes a vector of length n, and returns a n x n numpy array where the element in row i and column j is given by the difference of the ith and jth element in the vector

def d(vector):
    return vector.apply(lambda x: x-vector).values

For example

d(df["x"])

gives

array([[ 0, -4, -2],
       [ 4,  0,  2],
       [ 2, -2,  0]], dtype=int64)

Plug this method into your formula like so, to perform calcs for all pairs of coordinates at once

slopes = d(df["z"])/np.sqrt(np.square(d(df["x"]))   np.square(d(df["y"])))

slopes looks like this

array([[        nan,  0.        ,  1.34164079],
       [ 0.        ,         nan,  0.83205029],
       [-1.34164079, -0.83205029,         nan]])

Note that in the formula, the absolute value of dz is not used. This is deliberate in order to not end up with answers of the form "(a,b) and (b,a)".

We can then use numpy.nanmax which calculates the maximum of an array, ignoring nan values, and use numpy.where to pull out the rows and columns which match the maximum found

cols, rows = np.where(slopes==np.nanmax(slopes))

We can then zip these up to get tuples of coordinates

list(zip(rows, cols))

which gives us [(0, 2)]

So the largest slope is between the coordinates in rows 0 and 2

CodePudding user response:

As often it depends of the size of the problem.

Lets do some investigations

Conclusions:

Numpy arrays are faster than even if no Numpy function is used. .

The use of numpy functions still accelerates but at the expense of:

  • some initialization time
  • a loss of precision which leads to warnings

Investigations

init

from collections import namedtuple
import pandas as pd
import numpy as np
import math
Run = namedtuple('run', ['order', 'npa', 'df'])
def generate(maxorder):
    run_list = []
    for n in range(1, maxorder   1):
        order = n
        npa = np.random.random(size=(10**n, 3))
        run_list.append(Run(order, npa, pd.DataFrame(npa, columns=list('xyz'))))
        
    return run_list                  
runs = generate(5)

tools

from functools import wraps
import time
def timeit(func):
    @wraps(func)
    def timed(*args, **kw):
        start_time = time.time()
        result = func(*args, **kw)
        end_time = time.time()
        print(f"{func.__name__}  {(end_time - start_time) * 1000} ms")
        return result
    return timed
def prints(f, data, max_order):
    for order,npa,df in runs[:max_order]:
        print()
        print("Order:", order)
        maxGrad, maxCoorPair = f(df if data == 'df' else npa)
        print("maxGrad:", maxGrad)
        print("Pair:")
        print(maxCoorPair)

Solutions

original solution

@timeit
def sol_original(df):
    maxGrad = 0
    currentGrad = 0
    height = 0
    
    for i in range(len(df)):
        for j in range(i 1,len(df)):
            height = abs(df.z.iloc[j]-df.z.iloc[i])
            distance = math.sqrt((df.x.iloc[j]-df.x.iloc[i])**2 (df.y.iloc[j]-df.y.iloc[i])**2)
            currentGrad = height/distance
            if currentGrad > maxGrad:
                maxGrad = currentGrad
                maxCoorPair = [(df.x.iloc[i],df.y.iloc[i]),(df.x.iloc[j],df.y.iloc[j])]
    return maxGrad, maxCoorPair
prints(sol_original, 'df', 3)
Order: 1
sol_original  2.979278564453125 ms
maxGrad: 4.274082602280762
Pair:
[(0.08217955028694601, 0.9160537098844143), (0.2396284679279188, 0.8196073645585937)]

Order: 2
sol_original  264.29247856140137 ms
maxGrad: 167.5282986999116
Pair:
[(0.09926115331767238, 0.7497707080285022), (0.09615387652529517, 0.7469231082687531)]

Order: 3
sol_original  24213.2625579834 ms
maxGrad: 1246.4073209631038
Pair:
[(0.8285207768494603, 0.016839864860434428), (0.8285102753052541, 0.016228726039262287)]

solution numpy1.0 : substitute np array to df

@timeit
def sol_numpy10(npa):
    maxGrad = 0
    currentGrad = 0
    height = 0
    
    for i in range(len(npa)):
        for j in range(i 1,len(npa)):
            height = abs(npa[j][2]-npa[i][2])
            distance = math.sqrt((npa[j][0]-npa[i][0])**2 (npa[j][1]-npa[i][1])**2)
            currentGrad = height/distance
            if currentGrad > maxGrad:
                maxGrad = currentGrad
                maxCoorPair = [(npa[i][0],npa[i][1]),(npa[j][0],npa[j][1])]
    return maxGrad, maxCoorPair
prints(sol_numpy10, 'np', 3)
Order: 1
sol_numpy10  0.0 ms
maxGrad: 4.274082602280762
Pair:
[(0.08217955028694601, 0.9160537098844143), (0.2396284679279188, 0.8196073645585937)]

Order: 2
sol_numpy10  13.963699340820312 ms
maxGrad: 167.5282986999116
Pair:
[(0.09926115331767238, 0.7497707080285022), (0.09615387652529517, 0.7469231082687531)]

Order: 3
sol_numpy10  998.3282089233398 ms
maxGrad: 1246.4073209631038
Pair:
[(0.8285207768494603, 0.016839864860434428), (0.8285102753052541, 0.016228726039262287)]

solution numpy1.1: idem numpy1.0 and optimize i coordinates

@timeit
def sol_numpy11(npa):
    maxGrad = 0
    currentGrad = 0
    height = 0
    
    for i in range(len(npa)):
        xi,yi,zi = npa[i]
        for j in range(i 1,len(npa)):
            height = abs(npa[j][2]-zi)
            distance = math.sqrt((npa[j][0]-xi)**2 (npa[j][1]-yi)**2)
            currentGrad = height/distance
            if currentGrad > maxGrad:
                maxGrad = currentGrad
                maxCoorPair = [(xi,yi),(npa[j][0],npa[j][1])]
    return(maxGrad, maxCoorPair)
prints(sol_numpy11, 'np', 3)
Order: 1
sol_numpy11  0.0 ms
maxGrad: 4.274082602280762
Pair:
[(0.08217955028694601, 0.9160537098844143), (0.2396284679279188, 0.8196073645585937)]

Order: 2
sol_numpy11  10.002613067626953 ms
maxGrad: 167.5282986999116
Pair:
[(0.09926115331767238, 0.7497707080285022), (0.09615387652529517, 0.7469231082687531)]

Order: 3
sol_numpy11  771.9049453735352 ms
maxGrad: 1246.4073209631038
Pair:
[(0.8285207768494603, 0.016839864860434428), (0.8285102753052541, 0.016228726039262287)]

solution numpy1.2; idem numpy1.1 and optimize j coordinates

@timeit
def sol_numpy12(npa):
    maxGrad = 0
    currentGrad = 0
    height = 0
    
    for i in range(len(npa)):
        xi,yi,zi = npa[i]
        for j in range(i 1,len(npa)):
            xj,yj,zj = npa[j]
            height = abs(zj-zi)
            distance = math.sqrt((xj-xi)**2 (yj-yi)**2)
            currentGrad = height/distance
            if currentGrad > maxGrad:
                maxGrad = currentGrad
                maxCoorPair = [(xi,yi),(xj,yj)]
    return(maxGrad, maxCoorPair)
prints(sol_numpy12, 'np', 3)
Order: 1
sol_numpy12  0.0 ms
maxGrad: 4.274082602280762
Pair:
[(0.08217955028694601, 0.9160537098844143), (0.2396284679279188, 0.8196073645585937)]

Order: 2
sol_numpy12  8.97526741027832 ms
maxGrad: 167.5282986999116
Pair:
[(0.09926115331767238, 0.7497707080285022), (0.09615387652529517, 0.7469231082687531)]

Order: 3
sol_numpy12  1075.1206874847412 ms
maxGrad: 1246.4073209631038
Pair:
[(0.8285207768494603, 0.016839864860434428), (0.8285102753052541, 0.016228726039262287)]

solution numpy2.0

Riley's answer

def d(vector):
    return vector.apply(lambda x: x-vector).values

@timeit
def sol_numpy20(df):
    slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"]))   np.square(d(df["y"])))
    maxGrad = np.nanmax(slopes)
    ind1, ind2 = np.where(slopes==maxGrad)
    maxCoorPair = [df.iloc[ind2]]
    return maxGrad, maxCoorPair
prints(sol_numpy20, 'df', 4)
Order: 1
10
sol_numpy20  6.981372833251953 ms
maxGrad: 4.274082602280762
Pair:
[          x         y         z
6  0.239628  0.819607  0.867972
5  0.082180  0.916054  0.078804]

Order: 2
100
sol_numpy20  42.88601875305176 ms
maxGrad: 167.5282986999116
Pair:
[           x         y         z
95  0.096154  0.746923  0.135159
45  0.099261  0.749771  0.841247]

Order: 3
1000


<ipython-input-201-ce6f8e091e1b>:7: RuntimeWarning: invalid value encountered in true_divide
  slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"]))   np.square(d(df["y"])))
<ipython-input-201-ce6f8e091e1b>:7: RuntimeWarning: invalid value encountered in true_divide
  slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"]))   np.square(d(df["y"])))
<ipython-input-201-ce6f8e091e1b>:7: RuntimeWarning: invalid value encountered in true_divide
  slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"]))   np.square(d(df["y"])))


sol_numpy20  625.3554821014404 ms
maxGrad: 1246.4073209631038
Pair:
[            x         y         z
991  0.828510  0.016229  0.893811
240  0.828521  0.016840  0.131971]

Order: 4
10000


<ipython-input-201-ce6f8e091e1b>:7: RuntimeWarning: invalid value encountered in true_divide
  slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"]))   np.square(d(df["y"])))


sol_numpy20  12746.904850006104 ms
maxGrad: 4346.72025021119
Pair:
[             x         y         z
3751  0.538723  0.168160  0.131924
1063  0.538800  0.168028  0.798256]

  • Related