Here's my script; it takes too much time to produce its output:
from math import radians, cos, sin, asin, sqrt
def dist(lat1, long1, lat2, long2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Uses the haversine formula with an earth radius of 6371 km and
    returns the distance in kilometers.
    """
    # convert decimal degrees to radians
    lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])
    # haversine formula
    dlon = long2 - long1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push `a` a hair above 1 for near-antipodal points, which
    # would make asin() raise a domain error, so clamp it. `a` is passed
    # first so that a NaN input still propagates as NaN.
    c = 2 * asin(sqrt(min(a, 1.0)))
    # Radius of earth in kilometers is 6371
    km = 6371 * c
    return km
def find_nearest(lat, long):
    """
    Return the 'sitename' of the row of the module-level ``bts`` table
    that lies closest to the point (lat, long).

    lat, long are forwarded unchanged to dist() as the first point, and
    each row's 'lat' / 'lon' values as the second point.
    NOTE(review): ``bts`` is defined outside this snippet; it is presumably
    a pandas DataFrame with 'lat', 'lon' and 'sitename' columns -- confirm.
    """
    # One dist() call per bts row (axis=1 hands each row to the lambda).
    distances = bts.apply(
        lambda row: dist(lat, long, row['lat'], row['lon']),
        axis=1)
    # idxmin() yields the index label of the smallest distance.
    return bts.loc[distances.idxmin(), 'sitename']
# For every row of airport_hospital_1, store the value returned by
# find_nearest() for that row's 'lat' / 'lon' in a 'sitename' column.
airport_hospital_1['sitename'] = airport_hospital_1.apply(
    lambda rec: find_nearest(rec['lat'], rec['lon']),
    axis=1,
)
Is there any way to make this code faster?
CodePudding user response:
To be faster, you should use compiled code.
pyproj allows you to calculate the distance between two points (Geod.line_length):
>>> import pyproj
>>> g = pyproj.Geod(ellps="WGS84")
>>> g.line_length([0, 1], [45, 46])
135869.0912468657
def nearest(pt, pts):
    """
    Return the element of ``pts`` that is closest to ``pt``.

    Closeness is the value of pyproj's Geod.line_length on the WGS84
    ellipsoid for the two-point line from ``pt`` to each candidate.
    The first item of every point is passed in the first argument and
    the second item in the second argument (presumably longitude and
    latitude respectively -- confirm against the pyproj documentation).
    """
    g = pyproj.Geod(ellps="WGS84")
    # min() with a key returns the candidate itself, not the distance.
    return min(pts, key=lambda x: g.line_length((pt[0], x[0]), (pt[1], x[1])))
# Example: pick, out of three candidates, the one nearest() selects for (5, 15).
pt = nearest(
    (5, 15),
    ((6, 14), (7, 15), (8, 16)),
)
print(pt)
Gives (6, 14)
CodePudding user response:
You can use BallTree:
Full example:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree, DistanceMetric
# bts: synthetic stand-in with N rows named site1..siteN and uniformly
# random coordinates (lat in [30, 65), lon in [-150, -70)).
N = 25000
df1 = pd.DataFrame({'sitename': 'site' + pd.RangeIndex(1, N + 1).astype(str),
                    'lat': np.random.uniform(30, 65, N),
                    'lon': np.random.uniform(-150, -70, N)})
# airport_hospital_1: synthetic stand-in with N rows named name1..nameN,
# drawn from the same coordinate ranges.
N = 2000
df2 = pd.DataFrame({'name': 'name' + pd.RangeIndex(1, N + 1).astype(str),
                    'lat': np.random.uniform(30, 65, N),
                    'lon': np.random.uniform(-150, -70, N)})
# Haversine metric object handed to the tree below.
dist = DistanceMetric.get_metric('haversine')
# bts: convert the (lat, lon) columns to radians and index them in a ball tree.
coords = np.radians(df1[['lat', 'lon']])
tree = BallTree(coords, metric=dist)
# airport_hospital_1: same conversion, then ask the tree for the single
# nearest neighbour (k=1) of every row.
coords = np.radians(df2[['lat', 'lon']])
distances, indices = tree.query(coords, k=1)
# indices has one column because k=1; use it to pull the matching bts
# site name for each query row.
df2['sitename'] = df1['sitename'].iloc[indices[:, 0]].values
Output:
name lat lon sitename
0 name1 32.109678 -129.168059 site15027
1 name2 54.597830 -72.638636 site1745
2 name3 30.280831 -132.000681 site8412
3 name4 35.490756 -128.500015 site22097
4 name5 55.455771 -83.004763 site21841
... ... ... ... ...
1995 name1996 56.375947 -95.854305 site12061
1996 name1997 52.222642 -80.091020 site2779
1997 name1998 41.023596 -135.235087 site191
1998 name1999 45.358313 -137.822065 site18762
1999 name2000 31.712401 -131.267527 site2479
[2000 rows x 4 columns]
Performance
>>> %timeit tree.query(coords, k=1)
18.3 ms ± 64.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)