But there is a lot of duplicate codes when I do that way in bottom.
what should I do to avoid duplicate and do it in a shorter way?
import numpy as np
import pandas as pd
train_purchases = pd.read_csv(r"C:\Users\Can\Desktop\dressipi_recsys2022\train_purchases.csv")
first_month = train_purchases.loc[(train_purchases['date'] > '2020-01-01') & (train_purchases['date'] <= '2020-01-31')].sort_values(by=["item_id"])["item_id"].tolist()
second_month = train_purchases.loc[(train_purchases['date'] > '2020-02-01') & (train_purchases['date'] <= '2020-02-31')].sort_values(by=["item_id"])["item_id"].tolist()
third_month = train_purchases.loc[(train_purchases['date'] > '2020-03-01') & (train_purchases['date'] <= '2020-03-31')].sort_values(by=["item_id"])["item_id"].tolist()
fourth_month = train_purchases.loc[(train_purchases['date'] > '2020-04-01') & (train_purchases['date'] <= '2020-04-31')].sort_values(by=["item_id"])["item_id"].tolist()
fifth_month = train_purchases.loc[(train_purchases['date'] > '2020-05-01') & (train_purchases['date'] <= '2020-05-31')].sort_values(by=["item_id"])["item_id"].tolist()
sixth_month = train_purchases.loc[(train_purchases['date'] > '2020-06-01') & (train_purchases['date'] <= '2020-06-31')].sort_values(by=["item_id"])["item_id"].tolist()
def most_frequent(List):
counter = 0
num = List[0]
for i in List:
curr_frequency = List.count(i)
if(curr_frequency> counter):
counter = curr_frequency
num = i
print(num," id sold", List.count(num), "times. ")
most_frequent(first_month)
most_frequent(second_month)
most_frequent(third_month)
most_frequent(fourth_month)
most_frequent(fifth_month)
most_frequent(sixth_month)
CodePudding user response:
you can use something like this:
start = '2020-01-01'
end = '2021-03-31'
first_day = pd.date_range(start, end, freq='MS').astype(str).to_list() #get first day of month given range
end_day = pd.date_range(start, end, freq='M').strftime("%Y-%m-%d 23:59:59").astype(str).to_list() #get last day of month of given date
dates = dict(zip(first_day, end_day)) #convert lists to dictionary
#dates={'2020-01-01':'2020-01-31'}....
train_purchases['date']=pd.to_datetime(train_purchases['date'])
for k,v in dates.items():
mask = train_purchases.loc[(train_purchases['date'] > k) & (train_purchases['date'] <= v)].sort_values(by=["item_id"]).item_id.value_counts()[:1]
print(mask.index[0]," id sold", mask.iloc[0], "times. ")
'''
8060 id sold 564 times.
8060 id sold 421 times.
8060 id sold 375 times.
8060 id sold 610 times.
8060 id sold 277 times.
8060 id sold 280 times.
8622 id sold 290 times.
8060 id sold 374 times.
8060 id sold 638 times.
8060 id sold 563 times.
8060 id sold 1580 times.
8060 id sold 765 times.
19882 id sold 717 times.
19882 id sold 570 times.
19882 id sold 690 times.
'''
Note
train_purchases['date'] > '2020-01-01'
If you use it as above, the first day of the month is not included in the calculation. If you want the first day of the month, you should use it as below
train_purchases['date'] >= '2020-01-01'