I have created a loop which does the job but is very inefficient in execution.
Example Dataset:
import pandas as pd
df=pd.DataFrame([[2801,0],[2801,0],[2801,0],[2801,0],[2801,3],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,2],[2801,0],[2801,0],[2801,0],[2801,0],[2801,1],[2801,0],[2801,0],[2801,0],[2869,-85],[2869,0],[2869,0],[2869,1],[2869,0],[2869,0],[2869,0],[2869,1],[2869,0],[2869,0],[2869,5],[2869,0],[2869,0],[2869,1],[2869,0],[2869,20],[2869,0],[2869,0],[2869,0],[2869,0],[2869,0],[2869,1],[2869,0],[2869,12],[2869,0],[2869,0],[2869,1],[2869,0],[2869,0],[2869,0],[2869,0],[2869,0],[2869,0],[2869,13],[2869,0],[2869,1],[2869,0],[2869,12],[2869,0],[2869,1],[2869,0],[2869,0],[2869,0],[2869,0],[2869,0],[3048,-75],[3048,0],[3048,0],[3048,0],[3048,1],[3048,0],[3048,0],[3048,0],[3048,0],[3048,1],[3048,0],[3048,0],[3048,0],[3048,1],[3048,0],[3048,0],[3048,0],[3048,0],[3048,1]],
columns=['id','day_delta'])
df
id day_delta
.
.
.
5 2801 0
6 2801 0
7 2801 0
8 2801 0
9 2801 0
10 2801 0
11 2801 0
12 2801 0
13 2801 0
14 2801 0
15 2801 2
16 2801 0
17 2801 0
18 2801 0
19 2801 0
20 2801 1
21 2801 0
22 2801 0
23 2801 0
24 2869 -85
25 2869 0
26 2869 0
27 2869 1
28 2869 0
29 2869 0
30 2869 0
31 2869 1
32 2869 0
33 2869 0
.
.
.
The code I applied for getting custom workweek:
df['workweek']=0
for i in range(len(df)):
if i==0:
df.loc[i,'workweek']=1
else:
if df["id"][i]==df["id"][i-1]:
if df["day_delta"][i-1]>1:
df['workweek'][i]=df['workweek'][i-1] 1
else:
df['workweek'][i]=df['workweek'][i-1]
else:
df['workweek'][i]=1
Result_1 :
id day_delta workweek
.
.
.
5 2801 0 2
6 2801 0 2
7 2801 0 2
8 2801 0 2
9 2801 0 2
10 2801 0 2
11 2801 0 2
12 2801 0 2
13 2801 0 2
14 2801 0 2
15 2801 2 2
16 2801 0 3
17 2801 0 3
18 2801 0 3
19 2801 0 3
20 2801 1 3
21 2801 0 3
22 2801 0 3
23 2801 0 3
24 2869 -85 1
25 2869 0 1
26 2869 0 1
27 2869 1 1
28 2869 0 1
29 2869 0 1
30 2869 0 1
31 2869 1 1
32 2869 0 1
33 2869 0 1
.
.
.
Code I applied for creating week sequence:
df['week_seq']=0
for i in range(len(df)):
if i==0:
df.loc[i,'week_seq']=1
else:
if df["workweek"][i]!=df["workweek"][i-1]:
df['week_seq'][i]=1
else:
df['week_seq'][i]=df['week_seq'][i-1] 1
Result_2:
id day_delta workweek week_seq
.
.
.
5 2801 0 2 1
6 2801 0 2 2
7 2801 0 2 3
8 2801 0 2 4
9 2801 0 2 5
10 2801 0 2 6
11 2801 0 2 7
12 2801 0 2 8
13 2801 0 2 9
14 2801 0 2 10
15 2801 2 2 11
16 2801 0 3 1
17 2801 0 3 2
18 2801 0 3 3
19 2801 0 3 4
20 2801 1 3 5
21 2801 0 3 6
22 2801 0 3 7
23 2801 0 3 8
24 2869 -85 1 1
25 2869 0 1 2
26 2869 0 1 3
27 2869 1 1 4
28 2869 0 1 5
29 2869 0 1 6
30 2869 0 1 7
31 2869 1 1 8
32 2869 0 1 9
33 2869 0 1 10
.
.
.
I am getting the correct result in column 'workweek' and 'week_seq', but the process is highly inefficient over large dataset.
Is there a better way of accomplishing this? Maybe making a function and using apply method to it?
something in this format maybe?:
df['col_3'] = df.apply(lambda x: f(x.col_1, x.col_2, x.col_3), axis=1)
I am not sure, how to apply this?
CodePudding user response:
import pandas as pd
import time
data = [[2801,0],[2801,0],[2801,0],[2801,0],[2801,3],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,0],[2801,2],[2801,0],[2801,0],[2801,0],[2801,0],[2801,1],[2801,0],[2801,0],[2801,0],[2869,-85],[2869,0],[2869,0],[2869,1],[2869,0],[2869,0],[2869,0],[2869,1],[2869,0],[2869,0],[2869,5],[2869,0],[2869,0],[2869,1],[2869,0],[2869,20],[2869,0],[2869,0],[2869,0],[2869,0],[2869,0],[2869,1],[2869,0],[2869,12],[2869,0],[2869,0],[2869,1],[2869,0],[2869,0],[2869,0],[2869,0],[2869,0],[2869,0],[2869,13],[2869,0],[2869,1],[2869,0],[2869,12],[2869,0],[2869,1],[2869,0],[2869,0],[2869,0],[2869,0],[2869,0],[3048,-75],[3048,0],[3048,0],[3048,0],[3048,1],[3048,0],[3048,0],[3048,0],[3048,0],[3048,1],[3048,0],[3048,0],[3048,0],[3048,1],[3048,0],[3048,0],[3048,0],[3048,0],[3048,1]]
# reference approach
s = time.time()
df=pd.DataFrame(data, columns=['id','day_delta'])
df['workweek']=0
for i in range(len(df)):
if i==0:
df.loc[i,'workweek']=1
else:
if df["id"][i]==df["id"][i-1]:
if df["day_delta"][i-1]>1:
df['workweek'][i]=df['workweek'][i-1] 1
else:
df['workweek'][i]=df['workweek'][i-1]
else:
df['workweek'][i]=1
df['week_seq']=0
for i in range(len(df)):
if i==0:
df.loc[i,'week_seq']=1
else:
if df["workweek"][i]!=df["workweek"][i-1]:
df['week_seq'][i]=1
else:
df['week_seq'][i]=df['week_seq'][i-1] 1
e = time.time()
target_delta = e - s
df_target = df.copy()
del df
# faster approach
def add_cols(df):
f = lambda g: (g.day_delta.shift(1) > 1).cumsum() 1
df['workweek'] = df.groupby('id').apply(f).values
inner = lambda g: np.array(range(1, len(g) 1))
outer = lambda g: g.groupby('workweek').apply(inner)
df['week_seq'] = np.concatenate(df.groupby('id').apply(outer).values)
s = time.time()
df = pd.DataFrame(data, columns=['id','day_delta'])
add_cols(df)
e = time.time()
faster_delta = e - s
# test
pd.testing.assert_frame_equal(df, df_target)
# runtime
speedup = target_delta / faster_delta
print(f'~{speedup:.2f}x speedup')
This seems to be a good deal faster when I run it (typically >5x speedups). But I'm curious as to how this speedup translates to running on more data.