I have the following problem. My data has this structure:
import pandas as pd
import numpy as np
# Sample data: one row per (Name, Time) observation with its Value marker.
input = dict(
    Name=["Tom", "Tom", "nick", "krish", "krish", "jack", "krish"],
    Age=[20, 20, 21, 19, 19, 18, 19],
    Time=[
        "2021-09-23 00:01:00",
        "2021-09-24 00:02:00",
        "2021-09-23 00:01:00",
        "2021-09-23 00:01:00",
        "2021-09-23 00:10:00",
        "2021-09-23 00:01:00",
        "2021-09-25 00:03:00",
    ],
    Value=[1, 5, 1, 1, 17, 2, 8],
)
# Column order follows the insertion order of the mapping above.
df_input = pd.DataFrame(data=input)
I would like to calculate the difference in minutes, per Name, between the row where Value starts with 1 and the row where Value ends with 9 or 17.
Desired output is:
# Expected result: the input columns plus the elapsed time per Name.
# NOTE: the column is named "Diff_hours", but the values are minutes.
output = {
    "Name": ["Tom", "Tom", "nick", "krish", "krish", "jack", "krish"],
    "Age": [20, 20, 21, 19, 19, 18, 19],
    "Time": [
        "2021-09-23 00:01:00",
        "2021-09-24 00:02:00",
        "2021-09-23 00:01:00",
        "2021-09-23 00:01:00",
        "2021-09-23 00:10:00",
        "2021-09-23 00:01:00",
        "2021-09-25 00:03:00",
    ],
    "Value": [1, 5, 1, 1, 17, 2, 8],
    # np.nan (lowercase) is used because the np.NaN alias was removed in
    # NumPy 2.0.
    "Diff_hours": [
        np.nan,  # Tom: no 9 or 17 at the end in Value
        np.nan,
        np.nan,  # nick: no 9 or 17 at the end in Value
        9,  # krish: 2021-09-23 00:10:00 minus 2021-09-23 00:01:00
        9,
        np.nan,  # jack: neither 1 at the beginning nor 9 or 17 at the end
        9,
    ],
}
df_output = pd.DataFrame(output)
I found this, but it did not help me: Time difference in day based on specific condition in pandas
CodePudding user response:
Check out this code, and don't hesitate to ask if you need more clarification:
from datetime import datetime as dt
def check(df, start_values=(1,), end_values=(9, 17)):
    """Return the whole minutes between a start marker and the next end marker.

    Rows are scanned in the order they appear in ``df`` (they are not sorted
    by time). The first row whose value is in ``start_values`` is the start;
    the first later row whose value is in ``end_values`` is the end.

    Columns are addressed by position: column 2 holds the timestamp and
    column 3 holds the marker value.

    Args:
        df: Frame for a single group, with the timestamp in column 2 and the
            marker value in column 3.
        start_values: Values that mark the start of an interval.
        end_values: Values that mark the end of an interval.

    Returns:
        The elapsed time from start to end in whole minutes as an ``int``,
        or ``None`` when no start is followed by an end.
    """
    start = None
    for row_idx, value in enumerate(df.iloc[:, 3]):
        if start is None:
            # Compare whole values: a string-prefix test would also accept
            # 10, 17, 100, ... as a start marker.
            if value in start_values:
                start = row_idx
        elif value in end_values:
            # pd.to_datetime accepts both strings and already-parsed
            # timestamps.
            delta = pd.to_datetime(df.iloc[row_idx, 2]) - pd.to_datetime(
                df.iloc[start, 2]
            )
            # Total minutes; no "% 60", which would drop whole hours and days.
            return int(delta.total_seconds() // 60)
    return None
# Fill "Diff_hours" for every row of each Name with that group's result.
# The original looped over an undefined `names`; take the distinct names
# straight from the frame instead.
for name in df_input["Name"].unique():
    # Build the row mask once and reuse it for both the read and the write.
    mask = df_input["Name"] == name
    df_input.loc[mask, "Diff_hours"] = check(df_input.loc[mask, :])
df_input
This gives you exactly the output that you wanted.
CodePudding user response:
Here is the solution I came up with, though there might be a better one:
# Keep only the rows that carry a start (1) or end (9, 17) marker.
# .copy() makes this an independent frame, so the column assignments below
# do not trigger SettingWithCopyWarning. The frame is named `markers`
# rather than `help` to avoid shadowing the builtin.
markers = df_input.loc[
    df_input["Value"].isin([1, 9, 17]), ["Name", "Time", "Value"]
].copy()
markers["Time"] = pd.to_datetime(markers["Time"])

# Time elapsed since the previous marker row of the same Name, in time order.
# The result is aligned back to `markers` by index, so row order is unchanged.
elapsed = markers.sort_values(["Name", "Time"]).groupby("Name")["Time"].diff()
# The first marker row of each Name has no predecessor; treat it as zero.
elapsed = elapsed.fillna(pd.Timedelta(seconds=0))
markers["diff"] = elapsed.dt.total_seconds().div(60).astype(int)

# Drop the zero-minute rows and keep one (Name, diff) pair per interval.
markers = markers.loc[markers["diff"] != 0, ["Name", "diff"]]

# Left merge: names without a measured interval get NaN in "diff".
df_output = df_input.merge(markers, how="left", on="Name")
print(df_output)