window_mask for timedelta window for rolling list is ignoring first seen record-CodePudding

I have this modified rolling list that I need for specific test submission analysis. This is my modified code:

import pandas as pd

df = pd.DataFrame(
    data={
        "Value": [5,10,5,5,10,5,5,5,15,5,5,10,15],
        "Time": ['2022-06-22 13:56:25',
        '2022-06-22 13:56:32',
        '2022-06-22 13:56:39',
        '2022-06-22 13:56:48',
        '2022-06-22 13:58:49',
        '2022-06-22 13:58:57',
        '2022-06-22 13:59:28',
        '2022-06-22 13:59:37',
        '2022-06-22 13:59:46',
        '2022-06-22 13:59:57',
        '2022-06-22 14:00:06',
        '2022-06-22 14:01:30',
        '2022-06-22 14:02:11'],
    }   
)
df["Time"] = pd.to_datetime(df["Time"],format='%Y-%m-%d %H:%M:%S')
print(df.info(verbose=True))
print(df)
class ValueRollingList:
    def __init__(self,T='5T'):
        self.cur = pd.DataFrame(columns=['Value','Time'])
        self.window = pd.Timedelta(T)
        self.new_df = pd.DataFrame()
        self.last_window = list()

def __add(self, row):
    idx = self.cur.index.max()
    new_idx = idx 1 if idx==idx else 0
    self.cur.loc[new_idx] = row[['Value','Time']]
    
def handle_row(self, row):
    if len(self.cur) > 0:
        ## iterate through every unique Value except for 0
        self.cur = self.cur[~self.cur.Value.eq(0)]
        # create a time window of T and shape the original df to reflect
        # only the records within that time window
        window_mask = (row['Time'] - self.cur['Time']).abs() <= self.window
        if ~window_mask.all():
            df = self.cur.loc[window_mask]
            if len(df)>0:
                df= df[~df.Time.isin(self.last_window)]
                if len(self.cur)==11:
                    print("**************")
                    print("self.cur")
                    print(self.cur)
                    print("df")
                    print(df)
                    print('self.last_window')
                    print(self.last_window)
                    print('window_mask')
                    print(window_mask)
                    print('self.window')
                    print(self.window)
                    print((row['Time'] - self.cur['Time']).abs())
                    print((row['Time'] - self.cur['Time']))
                    print(row['Time'])
                    print(self.cur['Time'])
                    print("**************")
                df2=df[['Value']].value_counts().reset_index(inplace=False)
                df2.columns = ["Value","Count"]

                # Only record the time windows that have more than one record
                if len(df)>2:
                    print('#######################')
                    print(df['Time'])
                    print(len(df))
                    self.last_window = self.last_window   (df["Time"].tolist())
                    print('-------------------')
                    print("self.last_window count: %d" %(len(self.last_window)))
                    print('-------------------')
                    print("df count: %d"%(len(df)))
                    print(df['Time'])
                    print(df)
                    if len(df)>0:
                        self.new_df=pd.concat([self.new_df,df.tail(1).reset_index(drop=True).drop(columns='Value').join(df2.pivot_table(columns='Value').reset_index(drop=True),how='outer')])   
    self.__add(row)
    return

def dump_last(self):
    return self.new_df.reset_index(inplace=False).drop('index',axis=1).fillna(0)


# Instantiate the class that will count the unique values
# within the designated timeframe
rolling_list = ValueRollingList('5T')
s = df.apply(rolling_list.handle_row, axis=1)
#idx = s.index.max()
ValCountTimeWin_df = rolling_list.dump_last()
print("ValCountTimeWin_df")
print(ValCountTimeWin_df)

The following are the printouts that I'm getting:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Value   13 non-null     int64         
 1   Time    13 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 336.0 bytes
None
    Value                Time
0       5 2022-06-22 13:56:25
1      10 2022-06-22 13:56:32
2       5 2022-06-22 13:56:39
3       5 2022-06-22 13:56:48
4      10 2022-06-22 13:58:49
5       5 2022-06-22 13:58:57
6       5 2022-06-22 13:59:28
7       5 2022-06-22 13:59:37
8      15 2022-06-22 13:59:46
9       5 2022-06-22 13:59:57
10      5 2022-06-22 14:00:06
11     10 2022-06-22 14:01:30
12     15 2022-06-22 14:02:11
**************
self.cur
   Value                Time
0      5 2022-06-22 13:56:25
1     10 2022-06-22 13:56:32
2      5 2022-06-22 13:56:39
3      5 2022-06-22 13:56:48
4     10 2022-06-22 13:58:49
5      5 2022-06-22 13:58:57
6      5 2022-06-22 13:59:28
7      5 2022-06-22 13:59:37
8     15 2022-06-22 13:59:46
9      5 2022-06-22 13:59:57
10     5 2022-06-22 14:00:06
df
   Value                Time
1     10 2022-06-22 13:56:32
2      5 2022-06-22 13:56:39
3      5 2022-06-22 13:56:48
4     10 2022-06-22 13:58:49
5      5 2022-06-22 13:58:57
6      5 2022-06-22 13:59:28
7      5 2022-06-22 13:59:37
8     15 2022-06-22 13:59:46
9      5 2022-06-22 13:59:57
10     5 2022-06-22 14:00:06
self.last_window
[]
window_mask
0     False
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
Name: Time, dtype: bool
self.window
0 days 00:05:00
0    0 days 00:05:05
1    0 days 00:04:58
2    0 days 00:04:51
3    0 days 00:04:42
4    0 days 00:02:41
5    0 days 00:02:33
6    0 days 00:02:02
7    0 days 00:01:53
8    0 days 00:01:44
9    0 days 00:01:33
10   0 days 00:01:24
Name: Time, dtype: timedelta64[ns]
0    0 days 00:05:05
1    0 days 00:04:58
2    0 days 00:04:51
3    0 days 00:04:42
4    0 days 00:02:41
5    0 days 00:02:33
6    0 days 00:02:02
7    0 days 00:01:53
8    0 days 00:01:44
9    0 days 00:01:33
10   0 days 00:01:24
Name: Time, dtype: timedelta64[ns]
2022-06-22 14:01:30
0    2022-06-22 13:56:25
1    2022-06-22 13:56:32
2    2022-06-22 13:56:39
3    2022-06-22 13:56:48
4    2022-06-22 13:58:49
5    2022-06-22 13:58:57
6    2022-06-22 13:59:28
7    2022-06-22 13:59:37
8    2022-06-22 13:59:46
9    2022-06-22 13:59:57
10   2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
**************
#######################
1    2022-06-22 13:56:32
2    2022-06-22 13:56:39
3    2022-06-22 13:56:48
4    2022-06-22 13:58:49
5    2022-06-22 13:58:57
6    2022-06-22 13:59:28
7    2022-06-22 13:59:37
8    2022-06-22 13:59:46
9    2022-06-22 13:59:57
10   2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
10
-------------------
self.last_window count: 10
-------------------
df count: 10
1    2022-06-22 13:56:32
2    2022-06-22 13:56:39
3    2022-06-22 13:56:48
4    2022-06-22 13:58:49
5    2022-06-22 13:58:57
6    2022-06-22 13:59:28
7    2022-06-22 13:59:37
8    2022-06-22 13:59:46
9    2022-06-22 13:59:57
10   2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
   Value                Time
1     10 2022-06-22 13:56:32
2      5 2022-06-22 13:56:39
3      5 2022-06-22 13:56:48
4     10 2022-06-22 13:58:49
5      5 2022-06-22 13:58:57
6      5 2022-06-22 13:59:28
7      5 2022-06-22 13:59:37
8     15 2022-06-22 13:59:46
9      5 2022-06-22 13:59:57
10     5 2022-06-22 14:00:06
ValCountTimeWin_df
                 Time  5  10  15
0 2022-06-22 14:00:06  7   2   1

Ideally, my FINAL output should look like this:

ValCountTimeWin_df
                 Time  5  10  15
0 2022-06-22 14:00:06  8   2   1

I believe the problem lies with the window_mask:

window_mask
0     False
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True

Looking at self.cur the first record should've been included but it is ignored because the window_mask says it's false. Why and how do I fix that?

self.cur
   Value                Time
0      5 2022-06-22 13:56:25
1     10 2022-06-22 13:56:32
2      5 2022-06-22 13:56:39
3      5 2022-06-22 13:56:48
4     10 2022-06-22 13:58:49
5      5 2022-06-22 13:58:57
6      5 2022-06-22 13:59:28
7      5 2022-06-22 13:59:37
8     15 2022-06-22 13:59:46
9      5 2022-06-22 13:59:57
10     5 2022-06-22 14:00:06

CodePudding user response：

The root cause of the issue is handle_row function. In the first iteration of handle_row your dataframe self.cur is empty, therefore it just adds one row because self.__add(row) is called at the end, thus when you create the window_mask the row that is being evaluated and the rows that are being subtracted in the following line are not matching, so for the row with index 11 would be:

#             row.index = 11, self.cur.index = from 0 to 10
window_mask = (row['Time'] - self.cur['Time']).abs() <= self.window

This means, the value of Time that is being evaluated is 2022-06-22 14:01:30, since your timedelta is 5 minutes, the first row with Time = 2022-06-22 13:56:25 is out of the range of the 5 minutes you assigned to self.window. This is why you get False in the index 0.

An extra problem that maybe you didn't notice before, is that you are missing to process one row, because you are ignoring the first line with if len(self.cur) > 0: since len(self.cur) would be 0 for the first row that is trying to be processed with the df.apply()

Another issue appears when you create the class attribute df (Note: I would suggest changing the name so you distinguish that it is the class attribute and not the original dataframe, even though they are in different scopes cold be confusing if you expand the class):

df = self.cur.loc[window_mask]

Since you are using loc[window_mask] this means it will only select the rows where window_mask is True but you are evaluating in the self.cur dataframe that is evaluating one index before (As we saw in the first paragraph), so you are selecting the following elements as you can see from your output:

1     10 2022-06-22 13:56:32
2      5 2022-06-22 13:56:39
3      5 2022-06-22 13:56:48
4     10 2022-06-22 13:58:49
5      5 2022-06-22 13:58:57
6      5 2022-06-22 13:59:28
7      5 2022-06-22 13:59:37
8     15 2022-06-22 13:59:46
9      5 2022-06-22 13:59:57
10     5 2022-06-22 14:00:06

So when you calculate ValCountTimeWin_df, as you can see, you have only 7 rows with the value 5.

Fixing the issues:

At first, we have to be able to process all rows of the dataframe so we use self.__add(row) as the first line inside handle_row method. This will make sure that self.cur always processes the same data as the current row and avoid having the offset you had before.

To create the attribute df we will use iloc to be able to select all rows from 0 to the current row excluding it:

df = self.cur.iloc[:row.name]

Complete solution:

import pandas as pd

df = pd.DataFrame(
    data={
        "Value": [5,10,5,5,10,5,5,5,15,5,5,10,15],
        "Time": ['2022-06-22 13:56:25',
        '2022-06-22 13:56:32',
        '2022-06-22 13:56:39',
        '2022-06-22 13:56:48',
        '2022-06-22 13:58:49',
        '2022-06-22 13:58:57',
        '2022-06-22 13:59:28',
        '2022-06-22 13:59:37',
        '2022-06-22 13:59:46',
        '2022-06-22 13:59:57',
        '2022-06-22 14:00:06',
        '2022-06-22 14:01:30',
        '2022-06-22 14:02:11'],
    }   
)
df["Time"] = pd.to_datetime(df["Time"],format='%Y-%m-%d %H:%M:%S')
print(df.info(verbose=True))
print(df)

class ValueRollingList:
    def __init__(self,T='5T'):
        self.cur = pd.DataFrame(columns=['Value','Time'])
        self.window = pd.Timedelta(T)
        self.new_df = pd.DataFrame()
        self.last_window = list()
    
    def __add(self, row):
        idx = self.cur.index.max()
        new_idx = idx 1 if idx==idx else 0
        self.cur.loc[new_idx] = row[['Value','Time']]
    
    def handle_row(self, row):
        # Add the row before anything else to make sure we process all rows
        self.__add(row)
        
        self.cur = self.cur[~self.cur.Value.eq(0)]
        
        window_mask = (row['Time'] - self.cur['Time']).abs() <= self.window
        
        if ~window_mask.all():
            # Select all rows from 0 to one before the current row in "self.cur"
            # This would mean, process the rows from 0 to the current row (Including it) from the original data
            df = self.cur.iloc[:row.name]
            if len(df)>0:
                df= df[~df.Time.isin(self.last_window)]
                if len(self.cur)==11:
                    print("**************")
                    print("self.cur")
                    print(self.cur)
                    print("df")
                    print(df)
                    print('self.last_window')
                    print(self.last_window)
                    print('window_mask')
                    print(window_mask)
                    print('self.window')
                    print(self.window)
                    print((row['Time'] - self.cur['Time']).abs())
                    print((row['Time'] - self.cur['Time']))
                    print(row['Time'])
                    print(self.cur['Time'])
                    print("**************")
                df2=df[['Value']].value_counts().reset_index(inplace=False)
                df2.columns = ["Value","Count"]

                # Only record the time windows that have more than one record
                if len(df)>2:
                    print('#######################')
                    print(df['Time'])
                    print(len(df))
                    self.last_window = self.last_window   (df["Time"].tolist())
                    print('-------------------')
                    print("self.last_window count: %d" %(len(self.last_window)))
                    print('-------------------')
                    print("df count: %d"%(len(df)))
                    print(df['Time'])
                    print(df)
                    if len(df)>0:
                        self.new_df=pd.concat([self.new_df,df.tail(1).reset_index(drop=True).drop(columns='Value').join(df2.pivot_table(columns='Value').reset_index(drop=True),how='outer')])   
        
        return
    
    def dump_last(self):
        return self.new_df.reset_index(inplace=False).drop('index',axis=1).fillna(0)
    
rolling_list = ValueRollingList('5T')
df.apply(rolling_list.handle_row, axis=1)
ValCountTimeWin_df = rolling_list.dump_last()
print("ValCountTimeWin_df")
print(ValCountTimeWin_df)

This returns:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Value   13 non-null     int64         
 1   Time    13 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 336.0 bytes
None
    Value                Time
0       5 2022-06-22 13:56:25
1      10 2022-06-22 13:56:32
2       5 2022-06-22 13:56:39
3       5 2022-06-22 13:56:48
4      10 2022-06-22 13:58:49
5       5 2022-06-22 13:58:57
6       5 2022-06-22 13:59:28
7       5 2022-06-22 13:59:37
8      15 2022-06-22 13:59:46
9       5 2022-06-22 13:59:57
10      5 2022-06-22 14:00:06
11     10 2022-06-22 14:01:30
12     15 2022-06-22 14:02:11
#######################
0    2022-06-22 13:56:25
1    2022-06-22 13:56:32
2    2022-06-22 13:56:39
3    2022-06-22 13:56:48
4    2022-06-22 13:58:49
5    2022-06-22 13:58:57
6    2022-06-22 13:59:28
7    2022-06-22 13:59:37
8    2022-06-22 13:59:46
9    2022-06-22 13:59:57
10   2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
11
-------------------
self.last_window count: 11
-------------------
df count: 11
0    2022-06-22 13:56:25
1    2022-06-22 13:56:32
2    2022-06-22 13:56:39
3    2022-06-22 13:56:48
4    2022-06-22 13:58:49
5    2022-06-22 13:58:57
6    2022-06-22 13:59:28
7    2022-06-22 13:59:37
8    2022-06-22 13:59:46
9    2022-06-22 13:59:57
10   2022-06-22 14:00:06
Name: Time, dtype: datetime64[ns]
   Value                Time
0      5 2022-06-22 13:56:25
1     10 2022-06-22 13:56:32
2      5 2022-06-22 13:56:39
3      5 2022-06-22 13:56:48
4     10 2022-06-22 13:58:49
5      5 2022-06-22 13:58:57
6      5 2022-06-22 13:59:28
7      5 2022-06-22 13:59:37
8     15 2022-06-22 13:59:46
9      5 2022-06-22 13:59:57
10     5 2022-06-22 14:00:06
ValCountTimeWin_df
                 Time  5  10  15
0 2022-06-22 14:00:06  8   2   1