Home > Back-end >  How to split events / timestamps on weekdays
How to split events / timestamps on weekdays

Time:08-30

I have the following dataframe and can calculate the splits between the Startand End timestamp. It doesn't work for periods longer than 1 day and i can't concact my df properly:

import pandas as pd
import datetime as dt
df = pd.DataFrame({'Start':['2022-06-07 06:24:48','2022-06-07 14:37:16','2022-06-07 08:00:59'],
'End':['2022-06-07 14:07:00','2022-06-08 02:51:21','2022-06-09 13:18:34'],
'Process':['PROD','VORG','STO'], 
'Duration_Min':[462.20,734.08,3197.58]})

df['Start'] = pd.to_datetime(df['Start'])
df['End'] = pd.to_datetime(df['End'])

#Calculate the difference in days
df['difference']=df['End'].dt.date-df['Start'].dt.date

splits = df[df.End.dt.date > df.Start.dt.date].copy()

print(pd.concat([
    df,
    pd.DataFrame({
        'Start': list(splits.Start)   list(splits.End.dt.floor(freq='1D')),
        'End': list(splits.Start.dt.ceil(freq='1D'))   list(splits.End)})
]))

What I get:

Start                 End Process  Duration_Min difference
0 2022-06-07 06:24:48 2022-06-07 14:07:00    PROD        462.20     0 days
1 2022-06-07 14:37:16 2022-06-08 02:51:21    VORG        734.08     1 days
2 2022-06-07 08:00:59 2022-06-09 13:18:34     STO       3197.58     2 days
0 2022-06-07 14:37:16 2022-06-08 00:00:00     NaN           NaN        NaT
1 2022-06-07 08:00:59 2022-06-08 00:00:00     NaN           NaN        NaT
2 2022-06-08 00:00:00 2022-06-08 02:51:21     NaN           NaN        NaT
3 2022-06-09 00:00:00 2022-06-09 13:18:34     NaN           NaN        NaT

I would like to cut the events so that new timestamps with new intervals are created when the day changes. Days should be corosponding with weekday()

What I want:

Start                 End                   Process  Duration_Min  Days
0 2022-06-07 06:24:48 2022-06-07 14:07:00    PROD    462.200000     1
1 2022-06-07 14:37:16 2022-06-07 23:59:59    VORG    562.716667     1
2 2022-06-08 00:00:00 2022-06-08 02:51:21    VORG    171.350000     2
3 2022-06-07 08:00:59 2022-06-07 23:59:59     STO    959.000000     1
4 2022-06-08 00:00:00 2022-06-08 23:59:59     STO   1439.983333     2
5 2022-06-09 00:00:00 2022-06-09 13:18:34     STO    798.566667     3

CodePudding user response:

You can try this:

import pandas as pd

df = pd.DataFrame({'Start':['2022-06-07 06:24:48','2022-06-07 14:37:16','2022-06-07 08:00:59'],
                   'End':['2022-06-07 14:07:00','2022-06-08 02:51:21','2022-06-09 13:18:34'],
                   'Process':['PROD','VORG','STO'], 
                   'Duration_Min':[462.20,734.08,3197.58]})

df['Start'] = pd.to_datetime(df['Start'])
df['End'] = pd.to_datetime(df['End'])

print(df)

                Start                 End Process  Duration_Min
0 2022-06-07 06:24:48 2022-06-07 14:07:00    PROD        462.20
1 2022-06-07 14:37:16 2022-06-08 02:51:21    VORG        734.08
2 2022-06-07 08:00:59 2022-06-09 13:18:34     STO       3197.58

def split_dates(row):
    days = row.End.date() - row.Start.date()
    if days.days == 0:
        return

    dtr = pd.date_range(start=row.Start, end=row.End, freq='D', normalize=True)

    periods = []
    for i, t in enumerate(dtr):
        if i == 0:
            periods.append([t   pd.to_timedelta(str(row.Start.time()))])
            periods[i].append(t   pd.to_timedelta('23:59:59'))
            periods[i].append(row.Process)
            periods[i].append(round((periods[i][1] - periods[i][0]).seconds / 60, 2))
            continue
            
        if i == len(dtr) - 1:
            periods.append([t])
            periods[i].append(t   pd.to_timedelta(str(row.End.time())))
            periods[i].append(row.Process)
            periods[i].append(round((periods[i][1] - periods[i][0]).seconds / 60, 2))
            continue
            
        periods.append([t])
        periods[i].append(t   pd.to_timedelta('23:59:59'))
        periods[i].append(row.Process)
        periods[i].append(round((periods[i][1] - periods[i][0]).seconds / 60, 2))
    
    result = {'Start' : [], 'End' : [], 'Process' : [], 'Duration_Min' : []}
    for p in periods:
        result['Start'].append(p[0].strftime('%F %T'))
        result['End'].append(p[1].strftime('%F %T'))
        result['Process'].append(p[2])
        result['Duration_Min'].append(p[3])
        
    return result

res = pd.DataFrame()
for i, d in df.apply(split_dates, axis=1).items():
    if d:
        res = pd.concat([res, pd.DataFrame(d)])
        df = df.drop(index=i)

df = pd.concat([df, res], ignore_index=True)

df.Start = pd.to_datetime(df.Start)
df.End = pd.to_datetime(df.End)

df.sort_values(by=['Start', 'End'], inplace=True, ignore_index=True)

df['Days'] = df.Start.dt.weekday


print(df)

                Start                 End Process  Duration_Min  Days
0 2022-06-07 06:24:48 2022-06-07 14:07:00    PROD        462.20     1
1 2022-06-07 08:00:59 2022-06-07 23:59:59     STO        959.00     1
2 2022-06-07 14:37:16 2022-06-07 23:59:59    VORG        562.72     1
3 2022-06-08 00:00:00 2022-06-08 02:51:21    VORG        171.35     2
4 2022-06-08 00:00:00 2022-06-08 23:59:59     STO       1439.98     2
5 2022-06-09 00:00:00 2022-06-09 13:18:34     STO        798.57     3
  • Related