Create Sliding Window DataFrames with Timestamps-CodePudding

I need to create dataframes using a sliding window for multiple 24-hour dataframes covering the span of 2021-01-01 00:00:00 to 2021-12-31 23:56:00. Each interval between dataframes is 6 hours (hence start/end hours are 00,06,12,18). Doing this manually wouldnt be scalable, any input would be appreciated.

For example df1, df3, df5, and df7 will have Timestamp column with 24 hours of data.

start = '2021-01-01 00:00:00'
end = '2021-01-01 23:56:00'
df1 = main_df[(main_df.Timestamp >= start) & (main_df.Timestamp <= end)]

start = '2021-01-01 06:00:00'
end = '2021-01-02 05:56:00'
df3 = main_df[(main_df.Timestamp >= start) & (main_df.Timestamp <= end)]
 
start = '2021-01-01 12:00:00'
end = '2021-01-02 11:56:00'
df5 = main_df[(main_df.Timestamp >= start) & (main_df.Timestamp <= end)]

start = '2021-01-01 18:00:00'
end = '2021-01-02 17:56:00'
df7 = main_df[(main_df.Timestamp >= start) & (main_df.Timestamp <= end)]

The last dataframe dfx should have 24-hours being the last Timestamp = '2021-12-31 23:56:00'

start = '2021-12-31 00:00:00'
end = '2021-12-31 23:56:00'
dfx = main_df[(main_df.Timestamp >= start) & (main_df.Timestamp <= end)]

CodePudding user response：

One way to go about is to repeat your data 4 times, assign each with a label being the starting timestamp, then group:

freq = 6

periods = 24 // freq
shifted = pd.to_timedelta(np.arange(0,24,freq), unit='H')
group = df.Timestamp.dt.floor(f'{freq}H')


groups = pd.concat([
    df.assign(start=group-shift) for shift in shifted
]).groupby('start')

l = len(groups)

for i,(k,d) in enumerate(groups) :
    # only select the full sliding windows
    if periods - 1 <= i< l - periods:
        display(d)

you would get some thing like this:

              Timestamp      start
0   2021-01-01 00:00:00 2021-01-01
1   2021-01-01 00:04:00 2021-01-01
2   2021-01-01 00:08:00 2021-01-01
3   2021-01-01 00:12:00 2021-01-01
4   2021-01-01 00:16:00 2021-01-01
..                  ...        ...
355 2021-01-01 23:40:00 2021-01-01
356 2021-01-01 23:44:00 2021-01-01
357 2021-01-01 23:48:00 2021-01-01
358 2021-01-01 23:52:00 2021-01-01
359 2021-01-01 23:56:00 2021-01-01

CodePudding user response：

This works to create two lists of the start and end times that I need. For each of these pairs I will filter the 2021 dataframe down to a single 24-hour time period. The create_df_windows function follows the 2021 calendar year.

def create_df_windows():
  start_list = []
  end_list = []
  for a1 in [str(1).zfill(2),str(2).zfill(2),str(3).zfill(2),str(4).zfill(2),str(5).zfill(2),str(6).zfill(2),str(7).zfill(2),str(8).zfill(2),str(9).zfill(2),10,11,12]:
    if (a1 == str(1).zfill(2) or a1 == str(3).zfill(2) or a1 == str(5).zfill(2) or a1 == str(7).zfill(2) or a1 == str(8).zfill(2) or a1 == str(10).zfill(2) or a1 == str(12).zfill(2)):
      for a2 in [str(1).zfill(2),str(2).zfill(2),str(3).zfill(2),str(4).zfill(2),str(5).zfill(2),str(6).zfill(2),str(7).zfill(2),str(8).zfill(2),str(9).zfill(2),10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]:
        for b1, b2 in zip([str(0).zfill(2),str(6).zfill(2),12,18], [23,str(5).zfill(2),11,17]):
          start = '2021-{d1}-{d2} {t1}:00:00'.format(d1 = a1, d2 = a2, t1 = b1)
          end = '2021-{d1}-{d3} {t2}:56:00'.format(d1 = a1, d3 = a2, t2 = b2)
          start_list.append(pd.to_datetime(start, infer_datetime_format=True))
          end_list.append(pd.to_datetime(end, infer_datetime_format=True))
    elif a1 == str(2).zfill(2): #Feb
      for a2 in [str(1).zfill(2),str(2).zfill(2),str(3).zfill(2),str(4).zfill(2),str(5).zfill(2),str(6).zfill(2),str(7).zfill(2),str(8).zfill(2),str(9).zfill(2),10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28]:
        for b1, b2 in zip([str(0).zfill(2),str(6).zfill(2),12,18], [23,str(5).zfill(2),11,17]):
          start = '2021-{d1}-{d2} {t1}:00:00'.format(d1 = a1, d2 = a2, t1 = b1)
          end = '2021-{d1}-{d3} {t2}:56:00'.format(d1 = a1, d3 = a2, t2 = b2)
          start_list.append(pd.to_datetime(start, infer_datetime_format=True))
          end_list.append(pd.to_datetime(end, infer_datetime_format=True))
    else:
      for a2 in [str(1).zfill(2),str(2).zfill(2),str(3).zfill(2),str(4).zfill(2),str(5).zfill(2),str(6).zfill(2),str(7).zfill(2),str(8).zfill(2),str(9).zfill(2),10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]:
        for b1, b2 in zip([str(0).zfill(2),str(6).zfill(2),12,18], [23,str(5).zfill(2),11,17]):
          start = '2021-{d1}-{d2} {t1}:00:00'.format(d1 = a1, d2 = a2, t1 = b1)
          end = '2021-{d1}-{d3} {t2}:56:00'.format(d1 = a1, d3 = a2, t2 = b2)
          start_list.append(pd.to_datetime(start, infer_datetime_format=True))
          end_list.append(pd.to_datetime(end, infer_datetime_format=True)) 

  start_list.append(pd.to_datetime('2021-12-31 00:00:00', infer_datetime_format=True))
  end_list.append(pd.to_datetime('2021-12-31 23:56:00', infer_datetime_format=True))

  end_list2 = []
  for i in end_list:
    dt = pd.to_datetime(i, infer_datetime_format=True) timedelta(days=1)
    end_list2.append(dt)
  end_list2[0] = pd.to_datetime(end_list2[0], infer_datetime_format=True)-timedelta(days=1)
  return start_list, end_list2