I want to compare the performance of two pandas queries in Python.
import pandas as pd

# Column labels to attach to the frame, in the order the columns appear.
col = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "educationNum",
    "maritalStatus",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capitalGain",
    "capitalLoss",
    "hoursPerWeek",
    "nativeCountry",
    "Above50K",
]

# header=None makes pandas treat the first line of the file as data, so the
# frame starts with integer column labels that are then replaced by `col`.
df = pd.read_csv("adult.data", header=None)
df.columns = col
Query 1:
# Query 1: filter rows by passing a boolean mask directly to df[...].
# NOTE(review): the string literals carry a leading space; presumably the raw
# values in the file do too -- confirm against the data.
is_hs_grad = df.education == " HS-grad"
is_female = df.sex == " Female"
is_under_25 = df.age < 25
df[is_hs_grad & is_female & is_under_25]
Query 2:
# Query 2: the same row filter, expressed through .loc with an explicit
# "all columns" slice.
# NOTE(review): the string literals carry a leading space; presumably the raw
# values in the file do too -- confirm against the data.
is_hs_grad = df.education == " HS-grad"
is_female = df.sex == " Female"
is_under_25 = df.age < 25
df.loc[is_hs_grad & is_female & is_under_25, :]
Query 1 vs. Query 2: which one performs better?
CodePudding user response:
Code for the benchmark and the plot above:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import time
# Benchmark three equivalent ways of filtering rows on synthetic frames of
# growing size, then plot the measured run time against the row count.

# Label -> callable applying that filtering style to a frame. Keeping the
# variants in one table means the timing code below is written only once.
filters = {
    'with_out_loc': lambda frame: frame[
        (frame.education == "HS") & (frame.sex == "Female") & (frame.age < 25)
    ],
    'with_loc': lambda frame: frame.loc[
        (frame.education == "HS") & (frame.sex == "Female") & (frame.age < 25), :
    ],
    'with_query': lambda frame: frame.query(
        '(education=="HS") & (sex=="Female") & (age < 25)'
    ),
}

# One record per (row count, method) measurement.
bench = []

# Row counts are the squares of 10, 15, ..., 1495.
for num_row in np.power(np.arange(10, 1500, 5), 2):
    # Fresh random frame for this size; all three methods are timed on it.
    df = pd.DataFrame({
        'age': np.random.randint(0, 100, num_row),
        'sex': np.random.choice(['Female', 'Male'], num_row),
        'education': np.random.choice(['BS', 'MS', 'HS'], num_row),
    })

    for method, run_filter in filters.items():
        # perf_counter() is the clock intended for measuring short durations:
        # it is monotonic and uses the highest available resolution, whereas
        # time.time() is wall-clock time and can jump or be too coarse.
        start = time.perf_counter()
        run_filter(df)
        elapsed = time.perf_counter() - start
        bench.append({'Num_Rows': num_row, 'Method': method, 'Time': elapsed})

# Draw on an explicit Axes instead of relying on the implicit current axes.
fig, ax = plt.subplots(1, 1, figsize=(10, 7))

# `df` is rebound to the results table; the synthetic frame is no longer needed.
df = pd.DataFrame(bench)
sns.lineplot(data=df, x="Num_Rows", y="Time", hue="Method", style="Method", ax=ax)
plt.show()