I have a base dataframe (df_base) which contains my records and a lookup dataframe (df_lookup) containing a lookup list. I would like to find the closest lat/lon in df_lookup to the lat/lon in df_base and add them as columns. I am able to do this, but it is very slow: df_base has over 1 million rows and df_lookup is at about 10,000. I suspect there is a way to vectorize this or write it more efficiently, but I have not managed it yet. My working-but-slow code is as follows:
import numpy as np
from math import radians, sin, cos, asin, sqrt

def hav_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).
    """
    # convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    # Radius of earth in kilometers is 6371
    km = 6371 * c
    return km

def find_nearest(lat, lon, df_lookup):
    distances = df_lookup.apply(
        lambda row: hav_distance(lat, lon, row['lat'], row['lon']), axis=1)
    return df_lookup.loc[distances.idxmin(), ['lat', 'lon']]

df_base[['lookup lat', 'lookup long']] = df_base.apply(
    lambda row: find_nearest(row['Latitude'], row['Longitude'], df_lookup)
    if row[['Latitude', 'Longitude']].notnull().all()
    else np.nan, axis=1)
Example for df_base:
Latitude , Longitude
37.75210734489673 , -122.49572485891302
37.75046506608679 , -122.50583612245225
37.75612411999306 , -122.50728172021206
37.75726922992242 , -122.50251213426036
37.75243837156798 , -122.50442682534892
37.7519789637837 , -122.50402178717827
37.750903349294404 , -122.50241414813944
37.75602225181627 , -122.50060819272488
37.757921529607835 , -122.50036152209083
37.75628955086523 , -122.50694962686946
37.7573215112949 , -122.50224043772997
37.75074935869865 , -122.50127064328588
37.7528943256246 , -122.501056716164
37.754832309416386 , -122.50268274843049
37.757352142065265 , -122.50390638094865
37.75055972208169 , -122.50381787073599
37.753482040181844 , -122.49795018201644
37.7578160107123 , -122.50013574926646
37.749592580038346 , -122.50730545397994
37.7514871501036 , -122.49702703770673
Example for df_lookup:
lat , lon
37.751 , -122.5
37.752 , -122.5
37.753 , -122.5
37.754 , -122.5
37.755 , -122.5
37.756 , -122.5
37.757 , -122.5
37.758 , -122.5
37.759 , -122.5
CodePudding user response:
Here's a way to do it that vectorizes the calculations along one axis: loop over the lookup table and, for each lookup point, compute its distance to all of the base rows in a single NumPy operation.
import pandas as pd
import numpy as np
import random
from math import radians

def hav_distance(lat1, lon1, lat2, lon2):
    # Great circle distance between two points on the earth,
    # specified in radians. Radius of earth in kilometers is 6371.
    # haversine formula
    a = np.sin((lat2 - lat1) / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0)**2
    distance = 6371 * 2 * np.arcsin(np.sqrt(a))
    return distance

df_lookup[['rad_lat', 'rad_lon']] = df_lookup[['lat', 'lon']].applymap(radians)
dfb = df_base[['Latitude', 'Longitude']].applymap(radians).rename(
    columns={'Latitude': 'lat', 'Longitude': 'lon'})

# Since the max value of np.arcsin is np.pi / 2.0, the following
# initialization is larger than any actual haversine distance.
dfb['closest'] = 6371 * 2 * np.pi / 2.0 + 1
dfb['cur'] = np.nan

for iLookup in range(len(df_lookup)):
    # distance from every base row to the current lookup point, vectorized
    dfb.cur = hav_distance(dfb.lat, dfb.lon, df_lookup.rad_lat[iLookup], df_lookup.rad_lon[iLookup])
    mask = dfb.cur <= dfb.closest
    dfb.loc[mask, 'closest_lookup_idx'] = iLookup
    dfb.loc[mask, 'closest'] = dfb.cur

valid = df_base[['Latitude', 'Longitude']].notnull().all(axis=1)
df_base.loc[valid, ['lookup lat', 'lookup long']] = df_lookup.loc[
    dfb.closest_lookup_idx.dropna(), ['lat', 'lon']].to_numpy()
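Looping over the lookup table rather than the base table is the key design choice: each of the 10,000 iterations computes a full vector of one million distances in NumPy, instead of 1,000,000 Python-level iterations. Broadcasting both axes at once would remove the loop entirely, but a 1,000,000 x 10,000 float64 distance matrix would take roughly 80 GB of memory, so the one-axis approach is the practical middle ground.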
Sample input:
# random test coordinates (note: not clamped to valid lat/lon ranges)
nLookup = 1000
df_lookup = pd.DataFrame({
    'lat': [random.random() * 360 - 180 for _ in range(nLookup)],
    'lon': [random.random() * 360 - 180 for _ in range(nLookup)]})
df_base = pd.DataFrame({
    'Latitude': [random.random() * 360 - 180 for _ in range(1000000)],
    'Longitude': [random.random() * 360 - 180 for _ in range(1000000)]})
Sample Output:
Latitude Longitude lookup lat lookup long
0 -48.431374 -71.365602 -133.999431 100.602829
1 -83.701661 -110.421301 -84.789111 -99.283931
2 -112.419335 71.657476 -115.302104 74.354059
3 37.795702 -8.276904 35.171462 -5.945394
4 -18.813762 -32.348731 -22.496525 -24.347056
... ... ... ... ...
999995 139.056890 176.107864 138.961768 171.886359
999996 95.055924 -151.949260 84.079408 37.314496
999997 5.627785 -80.442339 4.956186 -80.219833
999998 -119.259137 51.234688 -64.474977 -124.594518
999999 -115.979939 -130.121593 -61.980270 55.774668
[1000000 rows x 4 columns]
For df_lookup with 1,000 rows, this takes 1-2 minutes in my environment; since the loop scales linearly with the number of lookup rows, 10,000 rows should take roughly 10-20 minutes.
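As an aside: if scikit-learn is an option, a BallTree with the haversine metric replaces the linear scan over df_lookup with a logarithmic-time nearest-neighbor query per row, which should bring the runtime down to seconds. A minimal sketch, assuming the column names from the question (BallTree's haversine metric expects [lat, lon] in radians and returns distances in radians, so multiply by 6371 for kilometers):

import numpy as np
from sklearn.neighbors import BallTree

# Build the tree once from the lookup points, in radians.
tree = BallTree(np.radians(df_lookup[['lat', 'lon']].to_numpy()),
                metric='haversine')

# Query only the base rows that have both coordinates.
valid = df_base[['Latitude', 'Longitude']].notnull().all(axis=1)
queries = np.radians(df_base.loc[valid, ['Latitude', 'Longitude']].to_numpy())
dist, idx = tree.query(queries, k=1)  # dist * 6371 = distance in km

df_base.loc[valid, ['lookup lat', 'lookup long']] = (
    df_lookup[['lat', 'lon']].to_numpy()[idx[:, 0]])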