I hope you are doing well. I have a problem I need help with. I am working with some data in pandas.
I want to find the two points whose difference is closest to a value entered by the user.
For example, if I input d=4, I want the fastest way to output the result ((C:18 and F:14) or (B:3 and D:7)). The method I implemented is the kind of approach an elementary school student would use, so I am ashamed to post it.
A solution in pandas or PySpark would be helpful. Thank you very much.
CodePudding user response:
We could try with merge
# Shift every value down by the target and join the shifted column against
# the unshifted values: a row pairs with another row exactly when their
# values differ by `target`.  Only exact differences match, not near misses.
target = 4  # named `target` rather than `input` so the builtin is not shadowed
out = df.assign(key=df['Value'] - target).merge(df.assign(key=df['Value']), on='key')
Out[59]:
Name_x Value_x key Name_y Value_y
0 C 18 14 F 14
1 D 7 3 B 3
2 E 11 7 D 7
CodePudding user response:
It's a bit complicated, so I built a class to hold all the methods. Each method should hopefully be self-explanatory. It uses heapq and itertools.
import heapq
from itertools import combinations
import pandas as pd
class ClosestDistances:
    """Find the combinations of points whose gap is closest to a target.

    Every combination of ``points`` entries is ranked by how far the gap
    between its two largest numbers is from ``user_selection``; the
    ``points`` best-ranked combinations are returned by :meth:`main`.

    :arg data: pd.DataFrame (or any column mapping) with a "letter" column
        of labels and a "number" column of values
    :arg user_selection: int, the target difference
    :arg points: int, used both as the size of each combination and as the
        number of combinations returned
    :return list[tuple(dict, ...)], one ``{letter: number}`` dict per point
    """

    def __init__(self, **kwargs):
        df = kwargs.get("data")
        if df is None:
            # Fail with a clear message instead of the opaque
            # "'NoneType' object is not subscriptable" from the lookups below.
            raise TypeError("ClosestDistances requires a 'data' keyword argument")
        self.user_selection = kwargs.get("user_selection")
        self.points = kwargs.get("points")
        self.df_mapping = dict(zip(df["letter"], df["number"]))

    def main(self) -> list:
        """Return the closest combinations as tuples of ``{letter: number}``.

        Labels travel with their numbers through the whole ranking instead
        of being looked up from the number afterwards, so entries that share
        a number keep their own letters.
        """
        labelled = [
            sorted(group, key=lambda item: item[1], reverse=True)
            for group in combinations(self.df_mapping.items(), self.points)
        ]
        nearest = heapq.nsmallest(
            self.points,
            labelled,
            key=lambda group: self._distance(group[0][1], group[1][1]),
        )
        return [
            tuple({letter: number} for letter, number in group)
            for group in nearest
        ]

    def _distance(self, high, low):
        """Return how far the gap ``high - low`` is from the target."""
        return abs((high - low) - self.user_selection)

    def nearest_difference(self, combos: list) -> list:
        """Return the ``points`` combos whose top-two gap is nearest the target.

        Each combo must be ordered largest-first, as produced by
        :meth:`possible_combinations`.
        """
        return heapq.nsmallest(
            self.points, combos, key=lambda x: self._distance(x[0], x[1])
        )

    def possible_combinations(self) -> list:
        """Return every ``points``-sized combination of numbers, largest first."""
        return [
            sorted(x, reverse=True)
            for x in combinations(self.df_mapping.values(), self.points)
        ]

    def get_keys(self) -> dict:
        """Return the inverse ``{number: letter}`` mapping.

        When several letters share a number only the last one survives, so
        :meth:`main` does not rely on this lookup.
        """
        return {v: k for k, v in self.df_mapping.items()}

    def map_nearest(self, closest_points: list) -> list:
        """Label each number in ``closest_points`` with its letter.

        Returns one tuple of ``{letter: number}`` dicts per combination.
        """
        keys = self.get_keys()  # built once rather than once per number
        return [
            tuple({keys.get(x): x} for x in combo) for combo in closest_points
        ]
# Sample points: one label per value.
letters = ["A", "B", "C", "D", "E", "F", "G"]
numbers = [12, 3, 18, 7, 11, 14, 5]
data = pd.DataFrame({"letter": letters, "number": numbers})

# Look for pairs of points (points=2) whose difference is nearest to 4.
finder = ClosestDistances(data=data, user_selection=4, points=2)
closest = finder.main()
print(closest)
[({'D': 7}, {'B': 3}), ({'C': 18}, {'F': 14})]