Count time in an 30 minutes interval in pandas-CodePudding

I got the following dataframe with two groups:

start_time	end_time	ID
10/10/2021 13:38	10/10/2021 14:30	A
31/10/2021 14:00	31/10/2021 15:00	A
21/10/2021 14:47	21/10/2021 15:30	B
23/10/2021 14:00	23/10/2021 15:30	B

I will ignore the date but only preserve the time for counting.

And I would like to create an 30 minutes interval as rows for each group first and then count, which should be similar to this:

start_interval	end_interval	count	ID
13:00	13:30	0	A
13:30	14:00	1	A
14:00	14:30	2	A
14:30	15:00	1	A
13:00	13:30	0	B
13:30	14:00	0	B
14:00	14:30	1	B
14:30	15:00	2	B
15:00	15:30	2	B

CodePudding user response：

Use:

#normalize all datetimes for 30 minutes
f = lambda x: pd.to_datetime(x).dt.floor('30Min')
df[["start_time", "end_time"]] = df[["start_time", "end_time"]].apply(f)

#get difference of 30 minutes
df['diff'] = df['end_time'].sub(df['start_time']).dt.total_seconds().div(1800).astype(int)
df['start_time'] = df['start_time'].sub(df['start_time'].dt.floor('d'))

#repeat by 30 minutes
df = df.loc[df.index.repeat(df['diff'])]
df['start_time']  = pd.to_timedelta(df.groupby(level=0).cumcount().mul(30), unit='Min')

#add starting dates - here 12:00
df1 = pd.DataFrame({'ID':df['ID'].unique(), 'start_time': pd.Timedelta(12, unit='H')})
print (df1)
  ID      start_time
0  A 0 days 12:00:00
1  B 0 days 12:00:00

df = pd.concat([df, df1])

#count per 30 minutes
df = df.set_index('start_time').groupby('ID').resample('30Min')['end_time'].count().reset_index(name='count')
#add end column

df['end_interval'] = df['start_time']   pd.Timedelta(30, unit='Min')
df = df.rename(columns={'start_time':'start_interval'})[['start_interval','end_interval','count','ID']]

print (df)
    start_interval    end_interval  count ID
0  0 days 12:00:00 0 days 12:30:00      0  A
1  0 days 12:30:00 0 days 13:00:00      0  A
2  0 days 13:00:00 0 days 13:30:00      0  A
3  0 days 13:30:00 0 days 14:00:00      1  A
4  0 days 14:00:00 0 days 14:30:00      2  A
5  0 days 14:30:00 0 days 15:00:00      1  A
6  0 days 12:00:00 0 days 12:30:00      0  B
7  0 days 12:30:00 0 days 13:00:00      0  B
8  0 days 13:00:00 0 days 13:30:00      0  B
9  0 days 13:30:00 0 days 14:00:00      0  B
10 0 days 14:00:00 0 days 14:30:00      1  B
11 0 days 14:30:00 0 days 15:00:00      2  B
12 0 days 15:00:00 0 days 15:30:00      2  B

CodePudding user response：

The input dataframe has start and end times. The resultant dataframe is a series of timestamps with 30min interval between them.

Here it is

# Import libs
import pandas as pd
from datetime import timedelta

# Sample Dataframe
df = pd.DataFrame(
    [
        ["10/10/2021 13:40", "10/10/2021 14:30", "A"],
        ["31/10/2021 14:00", "31/10/2021 15:00", "A"],
        ["21/10/2021 14:40", "21/10/2021 15:30", "B"],
        ["23/10/2021 14:00", "23/10/2021 15:30", "B"],
    ],
    columns=["start_time", "end_time", "ID"],
)

# convert to timedelta
df[["start_time", "end_time"]] = df[["start_time", "end_time"]].apply(
    lambda x: pd.to_datetime(x) - pd.to_datetime(x).dt.normalize()
)

# Extract seconds elapsed
df[["start_secs", "end_secs"]] = df[["start_time", "end_time"]].applymap(
    lambda x: x.seconds
)

# OUTPUT
#   start_time          end_time            ID  start_secs  end_secs
# 0     0 days 13:40:00     0 days 14:30:00     A   49200       52200
# 1     0 days 14:00:00     0 days 15:00:00     A   50400       54000
# 2     0 days 14:40:00     0 days 15:30:00     B   52800       55800
# 3     0 days 14:00:00     0 days 15:30:00     B   50400       55800

# Get rounded Min and Max time in secs of the dataframe
min_t, max_t = (df["start_secs"].min() // 3600) * 3600, (
    df["end_secs"].max() // 3600
) * 3600   3600


# Create Interval dataframe with 30min bins
interval_df = pd.DataFrame(
    map(lambda x: [x, x   30 * 60], range(min_t, max_t, 30 * 60)),
    columns=["start_interval", "end_interval"],
)

# OUTPUT
#   start_interval  end_interval
# 0     46800           48600
# 1     48600           50400
# 2     50400           52200
# 3     52200           54000
# 4     54000           55800
# 5     55800           57600

# It finds if the bin interval overlaps with the actual timeline and then count overlapping timelines of a single ID.
interval_df[["A", "B"]] = (
    df.groupby(["ID"])
    .apply(
        lambda x: x.apply(
            lambda y: ~(
                ((interval_df["end_interval"] - y["start_secs"]) <= 0)
                | ((interval_df["start_interval"] - y["end_secs"]) >= 0)
            ),
            axis=1,
        ).sum(axis=0)
    )
    .T
)

#   OUTPUT
#       start_interval  end_interval    A   B
# 0     46800           48600           0   0
# 1     48600           50400           1   0
# 2     50400           52200           2   1
# 3     52200           54000           1   2
# 4     54000           55800           0   2
# 5     55800           57600           0   0


# Convert seconds to time
interval_df[["start_interval", "end_interval"]] = interval_df[
    ["start_interval", "end_interval"]
].applymap(lambda x: str(timedelta(seconds=x)))


# Stack counts of A and B into one single column
interval_df.melt(["start_interval", "end_interval"])
# OUTPUT
#       start_interval  end_interval    variable    value
# 0     13:00:00        13:30:00        A           0
# 1     13:30:00        14:00:00        A           1
# 2     14:00:00        14:30:00        A           2
# 3     14:30:00        15:00:00        A           1
# 4     15:00:00        15:30:00        A           0
# 5     15:30:00        16:00:00        A           0
# 6     13:00:00        13:30:00        B           0
# 7     13:30:00        14:00:00        B           0
# 8     14:00:00        14:30:00        B           1
# 9     14:30:00        15:00:00        B           2
# 10    15:00:00        15:30:00        B           2
# 11    15:30:00        16:00:00        B           0