I have a base dataframe (df_base) which contains my records and a lookup dataframe (df_lookup) containing a lookup list. I would like to find the closest lat/lon in df_lookup to the lat/lon in df_base and add them as columns. I am able to do this, but it is very slow: df_base has over 1 million rows and df_lookup is at about 10,000. I suspect there is a way to vectorize this or write it more efficiently, but I have not managed it yet. My working-but-slow code is as follows:
import numpy as np
from math import radians, sin, cos, asin, sqrt

def hav_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).
    """
    # convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    # Radius of earth in kilometers is 6371
    km = 6371 * c
    return km

def find_nearest(lat, lon, df_lookup):
    distances = df_lookup.apply(
        lambda row: hav_distance(lat, lon, row['lat'], row['lon']), axis=1)
    return df_lookup.loc[distances.idxmin(), ['lat', 'lon']]

df_base[['lookup lat', 'lookup long']] = df_base.apply(
    lambda row: find_nearest(row['Latitude'], row['Longitude'], df_lookup)
    if row[['Latitude', 'Longitude']].notnull().all()
    else np.nan, axis=1)
Example for df_base:
Latitude , Longitude
37.75210734489673 , -122.49572485891302
37.75046506608679 , -122.50583612245225
37.75612411999306 , -122.50728172021206
37.75726922992242 , -122.50251213426036
37.75243837156798 , -122.50442682534892
37.7519789637837 , -122.50402178717827
37.750903349294404 , -122.50241414813944
37.75602225181627 , -122.50060819272488
37.757921529607835 , -122.50036152209083
37.75628955086523 , -122.50694962686946
37.7573215112949 , -122.50224043772997
37.75074935869865 , -122.50127064328588
37.7528943256246 , -122.501056716164
37.754832309416386 , -122.50268274843049
37.757352142065265 , -122.50390638094865
37.75055972208169 , -122.50381787073599
37.753482040181844 , -122.49795018201644
37.7578160107123 , -122.50013574926646
37.749592580038346 , -122.50730545397994
37.7514871501036 , -122.49702703770673
Example for df_lookup:
lat , lon
37.751 , -122.5
37.752 , -122.5
37.753 , -122.5
37.754 , -122.5
37.755 , -122.5
37.756 , -122.5
37.757 , -122.5
37.758 , -122.5
37.759 , -122.5
CodePudding user response:
Here's a way to do it that vectorizes the calculations along one axis: loop over the lookup table and, for each lookup point, compute its distance to all of the base rows in a single NumPy operation.
import pandas as pd
import numpy as np
import random
from math import radians

def hav_distance(lat1, lon1, lat2, lon2):
    # Great circle distance between two points on the earth,
    # specified in radians. Radius of earth in kilometers is 6371.
    # haversine formula
    a = np.sin((lat2 - lat1) / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0)**2
    distance = 6371 * 2 * np.arcsin(np.sqrt(a))
    return distance

df_lookup[['rad_lat', 'rad_lon']] = df_lookup[['lat', 'lon']].applymap(radians)
dfb = df_base[['Latitude', 'Longitude']].applymap(radians).rename(
    columns={'Latitude': 'lat', 'Longitude': 'lon'})

# Since the max value of np.arcsin is np.pi / 2.0, the following
# initialization is larger than any actual haversine distance.
dfb['closest'] = 6371 * 2 * np.pi / 2.0 + 1
dfb['cur'] = np.nan

for iLookup in range(len(df_lookup)):
    # distance from every base row to the current lookup point, vectorized
    dfb.cur = hav_distance(dfb.lat, dfb.lon, df_lookup.rad_lat[iLookup], df_lookup.rad_lon[iLookup])
    mask = dfb.cur <= dfb.closest
    dfb.loc[mask, 'closest_lookup_idx'] = iLookup
    dfb.loc[mask, 'closest'] = dfb.cur

valid = df_base[['Latitude', 'Longitude']].notnull().all(axis=1)
df_base.loc[valid, ['lookup lat', 'lookup long']] = df_lookup.loc[
    dfb.closest_lookup_idx.dropna(), ['lat', 'lon']].to_numpy()
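Looping over the lookup table rather than the base table is the key design choice: each of the 10,000 iterations computes a full vector of one million distances in NumPy, instead of 1,000,000 Python-level iterations. Broadcasting both axes at once would remove the loop entirely, but a 1,000,000 x 10,000 float64 distance matrix would take roughly 80 GB of memory, so the one-axis approach is the practical middle ground.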
Sample input:
# random test coordinates (note: not clamped to valid lat/lon ranges)
nLookup = 1000
df_lookup = pd.DataFrame({
    'lat': [random.random() * 360 - 180 for _ in range(nLookup)],
    'lon': [random.random() * 360 - 180 for _ in range(nLookup)]})
df_base = pd.DataFrame({
    'Latitude': [random.random() * 360 - 180 for _ in range(1000000)],
    'Longitude': [random.random() * 360 - 180 for _ in range(1000000)]})
Sample Output:
Latitude Longitude lookup lat lookup long
0 -48.431374 -71.365602 -133.999431 100.602829
1 -83.701661 -110.421301 -84.789111 -99.283931
2 -112.419335 71.657476 -115.302104 74.354059
3 37.795702 -8.276904 35.171462 -5.945394
4 -18.813762 -32.348731 -22.496525 -24.347056
... ... ... ... ...
999995 139.056890 176.107864 138.961768 171.886359
999996 95.055924 -151.949260 84.079408 37.314496
999997 5.627785 -80.442339 4.956186 -80.219833
999998 -119.259137 51.234688 -64.474977 -124.594518
999999 -115.979939 -130.121593 -61.980270 55.774668
[1000000 rows x 4 columns]
For df_lookup with 1,000 rows, this takes 1-2 minutes in my environment; since the loop scales linearly with the number of lookup rows, 10,000 rows should take roughly 10-20 minutes.
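As an aside: if scikit-learn is an option, a BallTree with the haversine metric replaces the linear scan over df_lookup with a logarithmic-time nearest-neighbor query per row, which should bring the runtime down to seconds. A minimal sketch, assuming the column names from the question (BallTree's haversine metric expects [lat, lon] in radians and returns distances in radians, so multiply by 6371 for kilometers):

import numpy as np
from sklearn.neighbors import BallTree

# Build the tree once from the lookup points, in radians.
tree = BallTree(np.radians(df_lookup[['lat', 'lon']].to_numpy()),
                metric='haversine')

# Query only the base rows that have both coordinates.
valid = df_base[['Latitude', 'Longitude']].notnull().all(axis=1)
queries = np.radians(df_base.loc[valid, ['Latitude', 'Longitude']].to_numpy())
dist, idx = tree.query(queries, k=1)  # dist * 6371 = distance in km

df_base.loc[valid, ['lookup lat', 'lookup long']] = (
    df_lookup[['lat', 'lon']].to_numpy()[idx[:, 0]])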