I have this modified rolling list that I need for specific test submission analysis. This is my modified code:
import pandas as pd
df = pd.DataFrame(
data={
"Value": [5,10,5,5,10,5,5,5,15,5,5,10,15],
"Time": ['2022-06-22 13:56:25',
'2022-06-22 13:56:32',
'2022-06-22 13:56:39',
'2022-06-22 13:56:48',
'2022-06-22 13:58:49',
'2022-06-22 13:58:57',
'2022-06-22 13:59:28',
'2022-06-22 13:59:37',
'2022-06-22 13:59:46',
'2022-06-22 13:59:57',
'2022-06-22 14:00:06',
'2022-06-22 14:01:30',
'2022-06-22 14:02:11'],
}
)
df["Time"] = pd.to_datetime(df["Time"],format='%Y-%m-%d %H:%M:%S')
print(df.info(verbose=True))
print(df)
class ValueRollingList:
def __init__(self,T='5T'):
self.cur = pd.DataFrame(columns=['Value','Time'])
self.window = pd.Timedelta(T)
self.new_df = pd.DataFrame()
self.last_window = list()
def __add(self, row):
idx = self.cur.index.max()
new_idx = idx 1 if idx==idx else 0
self.cur.loc[new_idx] = row[['Value','Time']]
def handle_row(self, row):
if len(self.cur) > 0:
## iterate through every unique Value except for 0
self.cur = self.cur[~self.cur.Value.eq(0)]
# create a time window of T and shape the original df to reflect
# only the records within that time window
window_mask = (row['Time'] - self.cur['Time']).abs() <= self.window
if ~window_mask.all():
df = self.cur.loc[window_mask]
if len(df)>0:
df= df[~df.Time.isin(self.last_window)]
if len(self.cur)==11:
print("**************")
print("self.cur")
print(self.cur)
print("df")
print(df)
print('self.last_window')
print(self.last_window)
print('window_mask')
print(window_mask)
print('self.window')
print(self.window)
print((row['Time'] - self.cur['Time']).abs())
print((row['Time'] - self.cur['Time']))
print(row['Time'])
print(self.cur['Time'])
print("**************")
df2=df[['Value']].value_counts().reset_index(inplace=False)
df2.columns = ["Value","Count"]
# Only record the time windows that have more than one record
if len(df)>2:
print('#######################')
print(df['Time'])
print(len(df))
self.last_window = self.last_window (df["Time"].tolist())
print('-------------------')
print("self.last_window count: %d" %(len(self.last_window)))
print('-------------------')
print("df count: %d"%(len(df)))
print(df['Time'])
print(df)
if len(df)>0:
self.new_df=pd.concat([self.new_df,df.tail(1).reset_index(drop=True).drop(columns='Value').join(df2.pivot_table(columns='Value').reset_index(drop=True),how='outer')])
self.__add(row)
return
def dump_last(self):
return self.new_df.reset_index(inplace=False).drop('index',axis=1).fillna(0)
# Instantiate the class that will count the unique values
# within the designated timeframe
rolling_list = ValueRollingList('5T')
s = df.apply(rolling_list.handle_row, axis=1)
#idx = s.index.max()
ValCountTimeWin_df = rolling_list.dump_last()
print("ValCountTimeWin_df")
print(ValCountTimeWin_df)
The following are the printouts that I'm getting:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Value 13 non-null int64
1 Time 13 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 336.0 bytes
None
Value Time
0 5 2022-06-22 13:56:25
1 10 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 10 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 15 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
11 10 2022-06-22 14:01:30
12 15 2022-06-22 14:02:11
**************
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 10 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 10 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 15 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
df
Value Time
1 10 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 10 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 15 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
self.last_window
[]
window_mask
0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
Name: Time, dtype: bool
self.window
0 days 00:05:00
0 0 days 00:05:05
1 0 days 00:04:58
2 0 days 00:04:51
3 0 days 00:04:42
4 0 days 00:02:41
5 0 days 00:02:33
6 0 days 00:02:02
7 0 days 00:01:53
8 0 days 00:01:44
9 0 days 00:01:33
10 0 days 00:01:24
Name: Time, dtype: timedelta64[ns]
0 0 days 00:05:05
1 0 days 00:04:58
2 0 days 00:04:51
3 0 days 00:04:42
4 0 days 00:02:41
5 0 days 00:02:33
6 0 days 00:02:02
7 0 days 00:01:53
8 0 days 00:01:44
9 0 days 00:01:33
10 0 days 00:01:24
Name: Time, dtype: timedelta64[ns]
2022-06-22 14:01:30
0 2022-06-22 13:56:25
1 2022-06-22 13:56:32
2 2022-06-22 13:56:39
3 2022-06-22 13:56:48
4 2022-06-22 13:58:49
5 2022-06-22 13:58:57
6 2022-06-22 13:59:28
7 2022-06-22 13:59:37
8 2022-06-22 13:59:46
9 2022-06-22 13:59:57
10 2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
**************
#######################
1 2022-06-22 13:56:32
2 2022-06-22 13:56:39
3 2022-06-22 13:56:48
4 2022-06-22 13:58:49
5 2022-06-22 13:58:57
6 2022-06-22 13:59:28
7 2022-06-22 13:59:37
8 2022-06-22 13:59:46
9 2022-06-22 13:59:57
10 2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
10
-------------------
self.last_window count: 10
-------------------
df count: 10
1 2022-06-22 13:56:32
2 2022-06-22 13:56:39
3 2022-06-22 13:56:48
4 2022-06-22 13:58:49
5 2022-06-22 13:58:57
6 2022-06-22 13:59:28
7 2022-06-22 13:59:37
8 2022-06-22 13:59:46
9 2022-06-22 13:59:57
10 2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
Value Time
1 10 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 10 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 15 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
ValCountTimeWin_df
Time 5 10 15
0 2022-06-22 14:00:06 7 2 1
Ideally, my FINAL output should look like this:
ValCountTimeWin_df
Time 5 10 15
0 2022-06-22 14:00:06 8 2 1
I believe the problem lies with the window_mask
:
window_mask
0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
Looking at self.cur
the first record should've been included but it is ignored because the window_mask says it's false. Why and how do I fix that?
self.cur
Value Time
0 5 2022-06-22 13:56:25
1 10 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 10 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 15 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
CodePudding user response:
The root cause of the issue is handle_row
function. In the first iteration of handle_row
your dataframe
self.cur
is empty, therefore it just adds one row because self.__add(row)
is called at the end, thus when you create the window_mask
the row that is being evaluated and the rows that are being subtracted in the following line are not matching, so for the row with index 11 would be:
# row.index = 11, self.cur.index = from 0 to 10
window_mask = (row['Time'] - self.cur['Time']).abs() <= self.window
This means, the value of Time
that is being evaluated is 2022-06-22 14:01:30
, since your timedelta
is 5 minutes, the first row with Time
= 2022-06-22 13:56:25
is out of the range of the 5 minutes you assigned to self.window
. This is why you get False
in the index 0
.
An extra problem that maybe you didn't notice before, is that you are missing to process one row, because you are ignoring the first line with if len(self.cur) > 0:
since len(self.cur)
would be 0
for the first row that is trying to be processed with the df.apply()
Another issue appears when you create the class attribute df
(Note: I would suggest changing the name so you distinguish that it is the class attribute and not the original dataframe
, even though they are in different scopes
cold be confusing if you expand the class):
df = self.cur.loc[window_mask]
Since you are using loc[window_mask]
this means it will only select the rows where window_mask
is True
but you are evaluating in the self.cur
dataframe
that is evaluating one index before (As we saw in the first paragraph), so you are selecting the following elements as you can see from your output:
1 10 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 10 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 15 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
So when you calculate ValCountTimeWin_df
, as you can see, you have only 7 rows with the value 5
.
Fixing the issues:
At first, we have to be able to process all rows of the dataframe
so we use self.__add(row)
as the first line inside handle_row
method. This will make sure that self.cur
always processes the same data as the current row and avoid having the offset you had before.
To create the attribute df
we will use iloc
to be able to select all rows from 0 to the current row excluding it:
df = self.cur.iloc[:row.name]
Complete solution:
import pandas as pd
df = pd.DataFrame(
data={
"Value": [5,10,5,5,10,5,5,5,15,5,5,10,15],
"Time": ['2022-06-22 13:56:25',
'2022-06-22 13:56:32',
'2022-06-22 13:56:39',
'2022-06-22 13:56:48',
'2022-06-22 13:58:49',
'2022-06-22 13:58:57',
'2022-06-22 13:59:28',
'2022-06-22 13:59:37',
'2022-06-22 13:59:46',
'2022-06-22 13:59:57',
'2022-06-22 14:00:06',
'2022-06-22 14:01:30',
'2022-06-22 14:02:11'],
}
)
df["Time"] = pd.to_datetime(df["Time"],format='%Y-%m-%d %H:%M:%S')
print(df.info(verbose=True))
print(df)
class ValueRollingList:
def __init__(self,T='5T'):
self.cur = pd.DataFrame(columns=['Value','Time'])
self.window = pd.Timedelta(T)
self.new_df = pd.DataFrame()
self.last_window = list()
def __add(self, row):
idx = self.cur.index.max()
new_idx = idx 1 if idx==idx else 0
self.cur.loc[new_idx] = row[['Value','Time']]
def handle_row(self, row):
# Add the row before anything else to make sure we process all rows
self.__add(row)
self.cur = self.cur[~self.cur.Value.eq(0)]
window_mask = (row['Time'] - self.cur['Time']).abs() <= self.window
if ~window_mask.all():
# Select all rows from 0 to one before the current row in "self.cur"
# This would mean, process the rows from 0 to the current row (Including it) from the original data
df = self.cur.iloc[:row.name]
if len(df)>0:
df= df[~df.Time.isin(self.last_window)]
if len(self.cur)==11:
print("**************")
print("self.cur")
print(self.cur)
print("df")
print(df)
print('self.last_window')
print(self.last_window)
print('window_mask')
print(window_mask)
print('self.window')
print(self.window)
print((row['Time'] - self.cur['Time']).abs())
print((row['Time'] - self.cur['Time']))
print(row['Time'])
print(self.cur['Time'])
print("**************")
df2=df[['Value']].value_counts().reset_index(inplace=False)
df2.columns = ["Value","Count"]
# Only record the time windows that have more than one record
if len(df)>2:
print('#######################')
print(df['Time'])
print(len(df))
self.last_window = self.last_window (df["Time"].tolist())
print('-------------------')
print("self.last_window count: %d" %(len(self.last_window)))
print('-------------------')
print("df count: %d"%(len(df)))
print(df['Time'])
print(df)
if len(df)>0:
self.new_df=pd.concat([self.new_df,df.tail(1).reset_index(drop=True).drop(columns='Value').join(df2.pivot_table(columns='Value').reset_index(drop=True),how='outer')])
return
def dump_last(self):
return self.new_df.reset_index(inplace=False).drop('index',axis=1).fillna(0)
rolling_list = ValueRollingList('5T')
df.apply(rolling_list.handle_row, axis=1)
ValCountTimeWin_df = rolling_list.dump_last()
print("ValCountTimeWin_df")
print(ValCountTimeWin_df)
This returns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Value 13 non-null int64
1 Time 13 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 336.0 bytes
None
Value Time
0 5 2022-06-22 13:56:25
1 10 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 10 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 15 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
11 10 2022-06-22 14:01:30
12 15 2022-06-22 14:02:11
#######################
0 2022-06-22 13:56:25
1 2022-06-22 13:56:32
2 2022-06-22 13:56:39
3 2022-06-22 13:56:48
4 2022-06-22 13:58:49
5 2022-06-22 13:58:57
6 2022-06-22 13:59:28
7 2022-06-22 13:59:37
8 2022-06-22 13:59:46
9 2022-06-22 13:59:57
10 2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
11
-------------------
self.last_window count: 11
-------------------
df count: 11
0 2022-06-22 13:56:25
1 2022-06-22 13:56:32
2 2022-06-22 13:56:39
3 2022-06-22 13:56:48
4 2022-06-22 13:58:49
5 2022-06-22 13:58:57
6 2022-06-22 13:59:28
7 2022-06-22 13:59:37
8 2022-06-22 13:59:46
9 2022-06-22 13:59:57
10 2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
Value Time
0 5 2022-06-22 13:56:25
1 10 2022-06-22 13:56:32
2 5 2022-06-22 13:56:39
3 5 2022-06-22 13:56:48
4 10 2022-06-22 13:58:49
5 5 2022-06-22 13:58:57
6 5 2022-06-22 13:59:28
7 5 2022-06-22 13:59:37
8 15 2022-06-22 13:59:46
9 5 2022-06-22 13:59:57
10 5 2022-06-22 14:00:06
ValCountTimeWin_df
Time 5 10 15
0 2022-06-22 14:00:06 8 2 1