Create a startime and endtime column from one timestamp column in R-CodePudding

I'm trying to identify when an event (cluster, 1 or 2) starts and stops, so I can use timeDiff and relate latlon data to weather data on these stop/start dates. Excluding location data for now, my data look like this (8000 rows, timestamps are 15 mins apart, 27 unique ids, clusters change within ids 50 times):

dput(head(mydata,80))structure(list(id = c(43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 
43486L, 43486L, 43486L), cluster = structure(c(2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", 
"2"), class = "factor"), idswitch = c("43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", 
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 1 1", 
"43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2", 
"43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2", 
"43486 2 2", "43486 2 2", "43486 2 2"), t_ = structure(c(1661663122, 
1661664035, 1661664948, 1661665900, 1661666812, 1661667724, 1661668637, 
1661669550, 1661670463, 1661671376, 1661672289, 1661673202, 1661674114, 
1661675026, 1661675939, 1661676852, 1661677764, 1661678676, 1661679588, 
1661680501, 1661681414, 1661682326, 1661683239, 1661684151, 1661685063, 
1661685975, 1661686888, 1661687800, 1661688712, 1661689624, 1661690536, 
1661691449, 1661692362, 1661693275, 1661694187, 1661695099, 1661696011, 
1661696923, 1661697835, 1661698747, 1661699659, 1661700571, 1661701483, 
1661702976, 1661703888, 1661704800, 1661705714, 1661706626, 1661707541, 
1661708454, 1661709366, 1661710279, 1661711191, 1661712104, 1661713017, 
1661713930, 1661714842, 1661715755, 1661716669, 1661717583, 1661718496, 
1661719410, 1661720324, 1661721236, 1661722149, 1661723061, 1661723973, 
1661724885, 1661725798, 1661726713, 1661727629, 1661728542, 1661729454, 
1661730370, 1661731283, 1661732196, 1661733109, 1661734022, 1661734934, 
1661735848), tzone = "", class = c("POSIXct", "POSIXt")), mode.switches = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)), row.names = c(NA, 
-80L), class = c("tbl_df", "tbl", "data.frame"))

Here is a (slightly modified) chunk to illustrate my problem:

id cluster idswitch  t_                  mode.switches
   <int> <fct>   <chr>     <dttm>                      <dbl>
 1 43486 2       43486 2 0 2022-08-28 00:05:22             0
 2 43486 2       43486 2 0 2022-08-28 00:20:35             0
 3 43486 2       43486 2 0 2022-08-28 00:35:48             0
 4 43486 2       43486 2 0 2022-08-28 00:51:40             0
 5 43486 2       43486 2 0 2022-08-28 01:06:52             0
 6 43486 1       43486 2 0 2022-08-28 01:22:04             0

I would like an output that looks like this:

idcluster   StartTime           EndTime  
\<chr\>      \<dttm\>              \<dttm\>  
1 43486 2 2022-08-28 00:05:22 2022-08-28 01:22:04  
2 43486 1 2022-08-28 01:22:04 2022-08-28 01:37:17
3 43486 2 2022-08-28 01:37:17 2022-08-28 02:07:43
4 43486 1 2022-08-28 02:07:43 2022-08-31 19:29:55

Where event 2 ends at the same time as event 1 starts.

I made id and cluster into one column using paste and made a new col, mode.switches, that sums how many times the event changes per id, thinking that might be helpful (spoiler, it wasn't). Some of my attempts below:

mydata <- mydata %>%
  group_by(id) %>%
  mutate(mode.switches = cumsum(c(0,as.numeric(diff(cluster))!=0))) %>%
  ungroup()

#First try---------
test <- all %>% 
  group_by(idswitch) %>%
  summarise(
    first = min(t_), 
    last = max(t_)
  )

#This is not what I want. An event stops and starts at the same time, and start/end times are wrong. Events  need to start when the event before them stops.
   idswitch   first               last               
   <chr>      <dttm>              <dttm>             
 1 43486 1 1  2022-08-28 16:59:33 2022-08-28 16:59:33
 2 43486 1 11 2022-08-30 14:12:25 2022-08-30 20:47:56
 3 43486 1 13 2022-08-30 21:18:20 2022-08-30 23:04:47
 4 43486 1 15 2022-08-31 14:40:47 2022-08-31 19:29:55
 5 43486 1 17 2022-08-31 20:00:22 2022-08-31 21:46:53

#lead/lag - it still respects idcluster so won't use t_  1 or -1 of another idcluster, but I need the times related to id and cluster....---------
test2 <- mydata %>%
  group_by(idcluster) %>%
  mutate(start=t_, stop=lead(t_, n=1))
sum(is.na(test$stop)) #606 NAs :(

#same problems------------------
test7<- all %>%
  filter(cluster == "1" | lag(cluster) == "1") %>%
  group_by(mode.switches, id, cluster, gr = cumsum(lag(cluster != "1", default = TRUE))) %>%
  summarise(StartTime = first(t_), 
            EndTime = last(t_)) %>%
  mutate(EndTime = replace(EndTime, StartTime == EndTime, NA)) %>%
  dplyr::select(-gr)
sum(is.na(test7$EndTime)) #378 NAs, lame.

CodePudding user response：

Thanks for adding the data.

You can first group_by the id. Then, filter and keep either the first row for a given id, or where there is a change in cluster.

Then, you can add an end column, which is the following row lead value of time.

library(dplyr)

df %>% 
  group_by(id) %>% 
  filter(row_number() == 1 | cluster != lag(cluster)) %>%
  mutate(end = lead(t_))

Output

     id cluster idswitch  t_                  mode.switches end                
  <int> <fct>   <chr>     <dttm>                      <dbl> <dttm>             
1 43486 2       43486 2 0 2022-08-28 01:05:22             0 2022-08-28 17:59:33
2 43486 1       43486 1 1 2022-08-28 17:59:33             1 2022-08-28 18:14:45
3 43486 2       43486 2 2 2022-08-28 18:14:45             2 NA