I'm trying to identify when an event (cluster, 1 or 2) starts and stops, so I can use timeDiff and relate latlon data to weather data on these stop/start dates. Excluding location data for now, my data look like this (8000 rows, timestamps are 15 mins apart, 27 unique ids, clusters change within ids 50 times):
dput(head(mydata,80))structure(list(id = c(43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L), cluster = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), idswitch = c("43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 1 1",
"43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2",
"43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2",
"43486 2 2", "43486 2 2", "43486 2 2"), t_ = structure(c(1661663122,
1661664035, 1661664948, 1661665900, 1661666812, 1661667724, 1661668637,
1661669550, 1661670463, 1661671376, 1661672289, 1661673202, 1661674114,
1661675026, 1661675939, 1661676852, 1661677764, 1661678676, 1661679588,
1661680501, 1661681414, 1661682326, 1661683239, 1661684151, 1661685063,
1661685975, 1661686888, 1661687800, 1661688712, 1661689624, 1661690536,
1661691449, 1661692362, 1661693275, 1661694187, 1661695099, 1661696011,
1661696923, 1661697835, 1661698747, 1661699659, 1661700571, 1661701483,
1661702976, 1661703888, 1661704800, 1661705714, 1661706626, 1661707541,
1661708454, 1661709366, 1661710279, 1661711191, 1661712104, 1661713017,
1661713930, 1661714842, 1661715755, 1661716669, 1661717583, 1661718496,
1661719410, 1661720324, 1661721236, 1661722149, 1661723061, 1661723973,
1661724885, 1661725798, 1661726713, 1661727629, 1661728542, 1661729454,
1661730370, 1661731283, 1661732196, 1661733109, 1661734022, 1661734934,
1661735848), tzone = "", class = c("POSIXct", "POSIXt")), mode.switches = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)), row.names = c(NA,
-80L), class = c("tbl_df", "tbl", "data.frame"))
Here is a (slightly modified) chunk to illustrate my problem:
id cluster idswitch t_ mode.switches
<int> <fct> <chr> <dttm> <dbl>
1 43486 2 43486 2 0 2022-08-28 00:05:22 0
2 43486 2 43486 2 0 2022-08-28 00:20:35 0
3 43486 2 43486 2 0 2022-08-28 00:35:48 0
4 43486 2 43486 2 0 2022-08-28 00:51:40 0
5 43486 2 43486 2 0 2022-08-28 01:06:52 0
6 43486 1 43486 2 0 2022-08-28 01:22:04 0
I would like an output that looks like this:
idcluster StartTime EndTime
\<chr\> \<dttm\> \<dttm\>
1 43486 2 2022-08-28 00:05:22 2022-08-28 01:22:04
2 43486 1 2022-08-28 01:22:04 2022-08-28 01:37:17
3 43486 2 2022-08-28 01:37:17 2022-08-28 02:07:43
4 43486 1 2022-08-28 02:07:43 2022-08-31 19:29:55
Where event 2 ends at the same time as event 1 starts.
I made id and cluster into one column using paste and made a new col, mode.switches, that sums how many times the event changes per id, thinking that might be helpful (spoiler, it wasn't). Some of my attempts below:
mydata <- mydata %>%
group_by(id) %>%
mutate(mode.switches = cumsum(c(0,as.numeric(diff(cluster))!=0))) %>%
ungroup()
#First try---------
test <- all %>%
group_by(idswitch) %>%
summarise(
first = min(t_),
last = max(t_)
)
#This is not what I want. An event stops and starts at the same time, and start/end times are wrong. Events need to start when the event before them stops.
idswitch first last
<chr> <dttm> <dttm>
1 43486 1 1 2022-08-28 16:59:33 2022-08-28 16:59:33
2 43486 1 11 2022-08-30 14:12:25 2022-08-30 20:47:56
3 43486 1 13 2022-08-30 21:18:20 2022-08-30 23:04:47
4 43486 1 15 2022-08-31 14:40:47 2022-08-31 19:29:55
5 43486 1 17 2022-08-31 20:00:22 2022-08-31 21:46:53
#lead/lag - it still respects idcluster so won't use t_ 1 or -1 of another idcluster, but I need the times related to id and cluster....---------
test2 <- mydata %>%
group_by(idcluster) %>%
mutate(start=t_, stop=lead(t_, n=1))
sum(is.na(test$stop)) #606 NAs :(
#same problems------------------
test7<- all %>%
filter(cluster == "1" | lag(cluster) == "1") %>%
group_by(mode.switches, id, cluster, gr = cumsum(lag(cluster != "1", default = TRUE))) %>%
summarise(StartTime = first(t_),
EndTime = last(t_)) %>%
mutate(EndTime = replace(EndTime, StartTime == EndTime, NA)) %>%
dplyr::select(-gr)
sum(is.na(test7$EndTime)) #378 NAs, lame.
Lots of similar questions exist, but none seem to solve my problem: Identify start and stop sequences and length of run in R Create start and end time column from a single datetime column in R R Start Column & End Column Time Log Obtaining the start and end dates from a single column of data How to extract start and end dates from single column in large dataset in R using lubridate? R trying to create start and stop times from single column Create start and endtime columns based on multiple conditions in R (dplyr, lubridate)
CodePudding user response:
Thanks for adding the data.
You can first group_by
the id
. Then, filter
and keep either the first row for a given id
, or where there is a change in cluster
.
Then, you can add an end
column, which is the following row lead
value of time.
library(dplyr)
df %>%
group_by(id) %>%
filter(row_number() == 1 | cluster != lag(cluster)) %>%
mutate(end = lead(t_))
Output
id cluster idswitch t_ mode.switches end
<int> <fct> <chr> <dttm> <dbl> <dttm>
1 43486 2 43486 2 0 2022-08-28 01:05:22 0 2022-08-28 17:59:33
2 43486 1 43486 1 1 2022-08-28 17:59:33 1 2022-08-28 18:14:45
3 43486 2 43486 2 2 2022-08-28 18:14:45 2 NA