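I would like to pass any number of cities to travel to, together with the number of days to spend in each city, to a function that returns a DataFrame with all possible permutations of the journey. The kayak_search_url
column in the DataFrame should contain this string in the first row: https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/WAW-BOG,nearby/2023-02-17/BOG-MIL,nearby/2023-02-20/MIL-SDQ,nearby/2023-02-23/SDQ-AMS,nearby/2023-02-25/?sort=bestflight_a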
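...but instead contains this string: https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/AMS-BOG,nearby/2023-02-17/AMS-MIL,nearby/2023-02-20/AMS-SDQ,nearby/2023-02-23/AMS,nearby/2023-02-25/?sort=bestflight_a
I can't figure out why the origin code 'AMS' shows up at the start of every leg instead of the chain of cities. Here's the code: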
# Trip configuration: the journey starts and ends in the same city, the first
# flight leaves on start_date, and days[i] is the length of stay in cities[i].
start_city = end_city = 'Amsterdam'
start_date = '2023-02-14'
cities = [
    'Warsaw',
    'Bogota',
    'Milan',
    'Santo Domingo',
]
days = [
    3,
    3,
    3,
    2,
]
def generate_permutations(cities, days, start_city, end_city, start_date):
    """Return every visiting order of ``cities`` with flight dates and a Kayak URL.

    Args:
        cities: Names of the cities to visit; each must be a key of the
            ``iata`` table defined in this function.
        days: Number of days to stay in each city, aligned with ``cities``.
        start_city: City the journey departs from.
        end_city: City the journey returns to.
        start_date: Date of the first flight, in any form accepted by
            ``pd.to_datetime`` (e.g. ``'2023-02-14'``).

    Returns:
        A DataFrame with one row per permutation of ``cities`` and the
        columns ``origin``, ``city1..cityN``, ``end``,
        ``flight_dt_1..flight_dt_{N+1}`` and ``kayak_search_url``.

    Raises:
        ValueError: If ``cities`` and ``days`` differ in length.
        KeyError: If a city has no entry in the ``iata`` table.
    """
    if len(cities) != len(days):
        raise ValueError("cities and days must have the same length")

    n_cities = len(cities)
    city_to_days = dict(zip(cities, days))
    permutations = list(itertools.permutations(cities))

    df = pd.DataFrame(
        permutations,
        columns=['city' + str(i) for i in range(1, n_cities + 1)],
    )
    df.insert(0, 'origin', start_city)
    df['end'] = end_city

    # Flight i+1 leaves after the stay in the i-th visited city, so the dates
    # depend on the order of the cities in each row.
    df['flight_dt_1'] = pd.to_datetime(start_date)
    for i in range(1, n_cities + 1):
        stay = pd.to_timedelta(df['city' + str(i)].map(city_to_days), unit='D')
        df['flight_dt_' + str(i + 1)] = df['flight_dt_' + str(i)] + stay

    # IATA city code dictionary from iata_code.csv file in repo
    iata = {
        'Amsterdam': 'AMS',
        'Warsaw': 'WAW',
        'Bogota': 'BOG',
        'Milan': 'MIL',
        'Santo Domingo': 'SDQ',
    }
    url = 'https://www.kayak.com/flights/'

    def _build_url(row):
        # Each leg departs from the previous stop, not from the origin:
        # origin -> city1 -> ... -> cityN -> end.
        stops = [row['origin']]
        stops.extend(row['city' + str(i)] for i in range(1, n_cities + 1))
        stops.append(row['end'])
        legs = [
            iata[dep] + '-' + iata[arr] + ',nearby/'
            + row['flight_dt_' + str(leg)].strftime('%Y-%m-%d')
            for leg, (dep, arr) in enumerate(zip(stops, stops[1:]), start=1)
        ]
        return url + '/'.join(legs) + '/?sort=bestflight_a'

    df['kayak_search_url'] = df.apply(_build_url, axis=1)
    return df
CodePudding user response:
Let's break down the desired URL to highlight its structure:
https://www.kayak.com/flights
/AMS-WAW,nearby/2023-02-14
/WAW-BOG,nearby/2023-02-17
/BOG-MIL,nearby/2023-02-20
/MIL-SDQ,nearby/2023-02-23
/SDQ-AMS,nearby/2023-02-25
/?sort=bestflight_a
Only the middle section needs to be generated, as the other parts are static. We can also generate that middle section before constructing the dataframe:
def generate_permutations(cities, days, start_city, end_city, start_date):
    """Return every visiting order of ``cities`` with flight dates and a Kayak URL.

    Args:
        cities: Names of the cities to visit; each must be a key of the
            ``iata`` table defined in this function.
        days: Number of days to stay in each city, aligned with ``cities``.
        start_city: City the journey departs from.
        end_city: City the journey returns to.
        start_date: Date of the first flight, in any form accepted by
            ``pd.to_datetime`` (e.g. ``"2023-02-14"``).

    Returns:
        A DataFrame with one row per permutation of ``cities`` and the
        columns ``origin``, ``city1..cityN``, ``end``,
        ``flight_dt_1..flight_dt_{N+1}`` and ``kayak_search_url``.

    Raises:
        ValueError: If ``cities`` and ``days`` differ in length.
        KeyError: If a city has no entry in the ``iata`` table.
    """
    if len(cities) != len(days):
        raise ValueError("cities and days must have the same length")

    iata = {
        "Amsterdam": "AMS",
        "Warsaw": "WAW",
        "Bogota": "BOG",
        "Milan": "MIL",
        "Santo Domingo": "SDQ",
    }
    first_departure = pd.to_datetime(start_date)

    rows = []
    # Permute (city, stay) pairs so each stay length travels with its city;
    # the flight dates therefore follow the order of the itinerary.
    for itinerary in itertools.permutations(zip(cities, days)):
        stops = (start_city, *(city for city, _ in itinerary), end_city)
        # Day offsets of each departure: 0, then the running total of stays.
        offsets = itertools.accumulate((stay for _, stay in itinerary), initial=0)
        flight_dates = [
            first_departure + pd.Timedelta(days=int(offset)) for offset in offsets
        ]
        # The pattern for each segment is
        # START-END,nearby/yyyy-mm-dd
        mid_url = "/".join(
            f"{iata[s]}-{iata[e]},nearby/{fd:%Y-%m-%d}"
            for s, e, fd in zip(stops, stops[1:], flight_dates)
        )
        rows.append(
            (
                *stops,
                *flight_dates,
                f"https://www.kayak.com/flights/{mid_url}/?sort=bestflight_a",
            )
        )

    # Generate the resulting dataframe
    columns = [
        "origin",
        *[f"city{i + 1}" for i in range(len(cities))],
        "end",
        *[f"flight_dt_{i + 1}" for i in range(len(cities) + 1)],
        "kayak_search_url",
    ]
    return pd.DataFrame(rows, columns=columns)