i have a large dataframe containing the x,y,z coordinates of a surface. i am looking to find the pair of rows with the largest slope between them (dz/sqrt(dx^2 dy^2))
maxGrad = 0
currentGrad = 0
height = 0
for i in range(len(df)):
for j in range(i 1,len(df)):
height = abs(df.z.iloc[j]-df.z.iloc[i])
distance = math.sqrt((df.x.iloc[j]-df.x.iloc[i])**2 (df.y.iloc[j]-df.y.iloc[i])**2)
currentGrad = height/distance
if currentGrad > maxGrad:
maxGrad = currentGrad
maxCoorPair = [df.x.iloc[i],df.y.iloc[i],df.x.iloc[j],df.y.iloc[j]]
print(maxGrad, maxCoorPair)
However this is not very elegant and the run time is very long due to the nested for loop.
How can i do it better?
CodePudding user response:
example data:
df = pd.DataFrame(
{
"x":[1,5,3],
"y":[12,10,13],
"z":[4,4,1],
}
)
let's define a function which takes a vector of length n, and returns a n x n numpy array where the element in row i and column j is given by the difference of the ith and jth element in the vector
def d(vector):
return vector.apply(lambda x: x-vector).values
For example
d(df["x"])
gives
array([[ 0, -4, -2],
[ 4, 0, 2],
[ 2, -2, 0]], dtype=int64)
Plug this method into your formula like so, to perform calcs for all pairs of coordinates at once
slopes = d(df["z"])/np.sqrt(np.square(d(df["x"])) np.square(d(df["y"])))
slopes
looks like this
array([[ nan, 0. , 1.34164079],
[ 0. , nan, 0.83205029],
[-1.34164079, -0.83205029, nan]])
Note that in the formula, the absolute value of dz is not used. This is deliberate in order to not end up with answers of the form "(a,b) and (b,a)".
We can then use numpy.nanmax
which calculates the maximum of an array, ignoring nan
values, and use numpy.where
to pull out the rows and columns which match the maximum found
cols, rows = np.where(slopes==np.nanmax(slopes))
We can then zip these up to get tuples of coordinates
list(zip(rows, cols))
which gives us [(0, 2)]
So the largest slope is between the coordinates in rows 0 and 2
CodePudding user response:
As often it depends of the size of the problem.
Lets do some investigations
Conclusions:
Numpy arrays are faster than even if no Numpy function is used. .
The use of numpy functions still accelerates but at the expense of:
- some initialization time
- a loss of precision which leads to warnings
Investigations
init
from collections import namedtuple
import pandas as pd
import numpy as np
import math
Run = namedtuple('run', ['order', 'npa', 'df'])
def generate(maxorder):
run_list = []
for n in range(1, maxorder 1):
order = n
npa = np.random.random(size=(10**n, 3))
run_list.append(Run(order, npa, pd.DataFrame(npa, columns=list('xyz'))))
return run_list
runs = generate(5)
tools
from functools import wraps
import time
def timeit(func):
@wraps(func)
def timed(*args, **kw):
start_time = time.time()
result = func(*args, **kw)
end_time = time.time()
print(f"{func.__name__} {(end_time - start_time) * 1000} ms")
return result
return timed
def prints(f, data, max_order):
for order,npa,df in runs[:max_order]:
print()
print("Order:", order)
maxGrad, maxCoorPair = f(df if data == 'df' else npa)
print("maxGrad:", maxGrad)
print("Pair:")
print(maxCoorPair)
Solutions
original solution
@timeit
def sol_original(df):
maxGrad = 0
currentGrad = 0
height = 0
for i in range(len(df)):
for j in range(i 1,len(df)):
height = abs(df.z.iloc[j]-df.z.iloc[i])
distance = math.sqrt((df.x.iloc[j]-df.x.iloc[i])**2 (df.y.iloc[j]-df.y.iloc[i])**2)
currentGrad = height/distance
if currentGrad > maxGrad:
maxGrad = currentGrad
maxCoorPair = [(df.x.iloc[i],df.y.iloc[i]),(df.x.iloc[j],df.y.iloc[j])]
return maxGrad, maxCoorPair
prints(sol_original, 'df', 3)
Order: 1
sol_original 2.979278564453125 ms
maxGrad: 4.274082602280762
Pair:
[(0.08217955028694601, 0.9160537098844143), (0.2396284679279188, 0.8196073645585937)]
Order: 2
sol_original 264.29247856140137 ms
maxGrad: 167.5282986999116
Pair:
[(0.09926115331767238, 0.7497707080285022), (0.09615387652529517, 0.7469231082687531)]
Order: 3
sol_original 24213.2625579834 ms
maxGrad: 1246.4073209631038
Pair:
[(0.8285207768494603, 0.016839864860434428), (0.8285102753052541, 0.016228726039262287)]
solution numpy1.0 : substitute np array to df
@timeit
def sol_numpy10(npa):
maxGrad = 0
currentGrad = 0
height = 0
for i in range(len(npa)):
for j in range(i 1,len(npa)):
height = abs(npa[j][2]-npa[i][2])
distance = math.sqrt((npa[j][0]-npa[i][0])**2 (npa[j][1]-npa[i][1])**2)
currentGrad = height/distance
if currentGrad > maxGrad:
maxGrad = currentGrad
maxCoorPair = [(npa[i][0],npa[i][1]),(npa[j][0],npa[j][1])]
return maxGrad, maxCoorPair
prints(sol_numpy10, 'np', 3)
Order: 1
sol_numpy10 0.0 ms
maxGrad: 4.274082602280762
Pair:
[(0.08217955028694601, 0.9160537098844143), (0.2396284679279188, 0.8196073645585937)]
Order: 2
sol_numpy10 13.963699340820312 ms
maxGrad: 167.5282986999116
Pair:
[(0.09926115331767238, 0.7497707080285022), (0.09615387652529517, 0.7469231082687531)]
Order: 3
sol_numpy10 998.3282089233398 ms
maxGrad: 1246.4073209631038
Pair:
[(0.8285207768494603, 0.016839864860434428), (0.8285102753052541, 0.016228726039262287)]
solution numpy1.1: idem numpy1.0 and optimize i coordinates
@timeit
def sol_numpy11(npa):
maxGrad = 0
currentGrad = 0
height = 0
for i in range(len(npa)):
xi,yi,zi = npa[i]
for j in range(i 1,len(npa)):
height = abs(npa[j][2]-zi)
distance = math.sqrt((npa[j][0]-xi)**2 (npa[j][1]-yi)**2)
currentGrad = height/distance
if currentGrad > maxGrad:
maxGrad = currentGrad
maxCoorPair = [(xi,yi),(npa[j][0],npa[j][1])]
return(maxGrad, maxCoorPair)
prints(sol_numpy11, 'np', 3)
Order: 1
sol_numpy11 0.0 ms
maxGrad: 4.274082602280762
Pair:
[(0.08217955028694601, 0.9160537098844143), (0.2396284679279188, 0.8196073645585937)]
Order: 2
sol_numpy11 10.002613067626953 ms
maxGrad: 167.5282986999116
Pair:
[(0.09926115331767238, 0.7497707080285022), (0.09615387652529517, 0.7469231082687531)]
Order: 3
sol_numpy11 771.9049453735352 ms
maxGrad: 1246.4073209631038
Pair:
[(0.8285207768494603, 0.016839864860434428), (0.8285102753052541, 0.016228726039262287)]
solution numpy1.2; idem numpy1.1 and optimize j coordinates
@timeit
def sol_numpy12(npa):
maxGrad = 0
currentGrad = 0
height = 0
for i in range(len(npa)):
xi,yi,zi = npa[i]
for j in range(i 1,len(npa)):
xj,yj,zj = npa[j]
height = abs(zj-zi)
distance = math.sqrt((xj-xi)**2 (yj-yi)**2)
currentGrad = height/distance
if currentGrad > maxGrad:
maxGrad = currentGrad
maxCoorPair = [(xi,yi),(xj,yj)]
return(maxGrad, maxCoorPair)
prints(sol_numpy12, 'np', 3)
Order: 1
sol_numpy12 0.0 ms
maxGrad: 4.274082602280762
Pair:
[(0.08217955028694601, 0.9160537098844143), (0.2396284679279188, 0.8196073645585937)]
Order: 2
sol_numpy12 8.97526741027832 ms
maxGrad: 167.5282986999116
Pair:
[(0.09926115331767238, 0.7497707080285022), (0.09615387652529517, 0.7469231082687531)]
Order: 3
sol_numpy12 1075.1206874847412 ms
maxGrad: 1246.4073209631038
Pair:
[(0.8285207768494603, 0.016839864860434428), (0.8285102753052541, 0.016228726039262287)]
solution numpy2.0
Riley's answer
def d(vector):
return vector.apply(lambda x: x-vector).values
@timeit
def sol_numpy20(df):
slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"])) np.square(d(df["y"])))
maxGrad = np.nanmax(slopes)
ind1, ind2 = np.where(slopes==maxGrad)
maxCoorPair = [df.iloc[ind2]]
return maxGrad, maxCoorPair
prints(sol_numpy20, 'df', 4)
Order: 1
10
sol_numpy20 6.981372833251953 ms
maxGrad: 4.274082602280762
Pair:
[ x y z
6 0.239628 0.819607 0.867972
5 0.082180 0.916054 0.078804]
Order: 2
100
sol_numpy20 42.88601875305176 ms
maxGrad: 167.5282986999116
Pair:
[ x y z
95 0.096154 0.746923 0.135159
45 0.099261 0.749771 0.841247]
Order: 3
1000
<ipython-input-201-ce6f8e091e1b>:7: RuntimeWarning: invalid value encountered in true_divide
slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"])) np.square(d(df["y"])))
<ipython-input-201-ce6f8e091e1b>:7: RuntimeWarning: invalid value encountered in true_divide
slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"])) np.square(d(df["y"])))
<ipython-input-201-ce6f8e091e1b>:7: RuntimeWarning: invalid value encountered in true_divide
slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"])) np.square(d(df["y"])))
sol_numpy20 625.3554821014404 ms
maxGrad: 1246.4073209631038
Pair:
[ x y z
991 0.828510 0.016229 0.893811
240 0.828521 0.016840 0.131971]
Order: 4
10000
<ipython-input-201-ce6f8e091e1b>:7: RuntimeWarning: invalid value encountered in true_divide
slopes = np.abs(d(df["z"]))/np.sqrt(np.square(d(df["x"])) np.square(d(df["y"])))
sol_numpy20 12746.904850006104 ms
maxGrad: 4346.72025021119
Pair:
[ x y z
3751 0.538723 0.168160 0.131924
1063 0.538800 0.168028 0.798256]