This dataframe:
df <- structure(list(Subject = c("A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
Timestamp = c("00:00:00.146 - 00:00:00.889",
"00:00:00.146 - 00:00:00.889", "00:00:01.568 - 00:00:02.183",
"00:00:01.568 - 00:00:02.183", "00:00:03.642 - 00:00:04.522",
"00:00:03.642 - 00:00:04.522", "00:00:00.000 - 00:00:00.660",
"00:00:01.247 - 00:00:02.229", "00:00:01.247 - 00:00:02.229",
"00:00:03.697 - 00:00:04.926", "00:00:03.697 - 00:00:04.926"),
Starttime_ms = c(0, 146, 889, 1568, 2183, 3642, 0, 660, 1247, 2229, 3697),
Endtime_ms = c(146, 889, 1568, 2183, 3642, 4522,660, 1247, 2229, 3697, 4926),
Duration = c(146, 743, 679, 615, 1459, 880, 660, 587, 982, 1468, 1229),
Event = c("nf", "f", "nf", "f", "nf", "f", "f", "nf", "f", "nf", "f")),
row.names = c(NA, -11L), class = c("tbl_df", "tbl", "data.frame"))
contains time measurements of Subject
s' ratings of an Event
during a timespan from 0 ms to roughly but not exactly 5000 ms. What is missing is the Subject
-final measurement up to the exact limit of 5000 ms. How can I insert a single row for that missing observation for each Subject
? (The missing value for Event
must be nf
.)
Expected:
Subject Timestamp Starttime_ms Endtime_ms Duration Event
<chr> <chr> <dbl> <dbl> <dbl> <chr>
1 A 00:00:00.146 - 00:00:00.889 0 146 146 nf
2 A 00:00:00.146 - 00:00:00.889 146 889 743 f
3 A 00:00:01.568 - 00:00:02.183 889 1568 679 nf
4 A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5 A 00:00:03.642 - 00:00:04.522 2183 3642 1459 nf
6 A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7 A 00:00:04.522 - 00:00:05.000 4522 5000 478 nf # <-- inserted
8 B 00:00:00.000 - 00:00:00.660 0 660 660 f
9 B 00:00:01.247 - 00:00:02.229 660 1247 587 nf
10 B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
11 B 00:00:03.697 - 00:00:04.926 2229 3697 1468 nf
12 B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
13 B 00:00:04.926 - 00:00:05.000 4926 5000 74 nf # <-- inserted
A dplyr
solution is preferred.
EDIT:
Came up with this solution:
library(tidyr)
library(dplyr)
df %>%
extract(Timestamp, into = c("start", "end"), regex = "(. ) - (. )") %>%
group_by(Subject) %>%
mutate(ID = row_number()) %>%
filter(ID == max(ID)) %>%
mutate(Duration = 5000 - Endtime_ms,
Event = sub("f", "nf", Event),
Starttime_ms = Endtime_ms,
Endtime_ms = 5000,
end = "00:00:05.000",
start = format(as.POSIXct(Starttime_ms/1000, "UTC", origin = "1970-01-01"), "%H:%M:%OS3"),
Timestamp = str_c(start, " - ", end)) %>%
# deselect not needed cols:
select(-c(start, end,ID)) %>%
# join back to original df:
bind_rows(.,df) %>%
# re-order:
arrange(Subject, Timestamp)
CodePudding user response:
Update:
library(dplyr)
library(tidyr)
library(lubridate)
df %>%
separate_rows(Timestamp, sep = " - ") %>%
mutate(Timestamp1 = lag(Timestamp, default = "00:00:00.000"), .before=Timestamp) %>%
group_by(Subject) %>%
slice(c(1:n(),n())) %>%
mutate(Timestamp = c(Timestamp[-n()], "00:00:05.000")) %>%
mutate(Timestamp1 = c(Timestamp1[-n()], nth(Timestamp, -2))) %>%
mutate(Starttime_ms = as.numeric(hms(Timestamp1))*1000,
Endtime_ms = as.numeric(hms(Timestamp))*1000,
Duration = Endtime_ms-Starttime_ms) %>%
mutate(Timestamp = paste(Timestamp1, Timestamp, sep = " - ")) %>%
left_join(df, by="Timestamp") %>%
mutate(Event.y = replace_na(Event.y, "nf")) %>%
select(1, 3:6, Event.y) %>%
filter(Endtime_ms.x !=0) %>%
rename_with(., ~str_replace(., '\\.\\w', ''))
Subject Timestamp Starttime_ms Endtime_ms Duration Event
<chr> <chr> <dbl> <dbl> <dbl> <chr>
1 A 00:00:00.000 - 00:00:00.146 0 146 146 nf
2 A 00:00:00.146 - 00:00:00.889 146 889 743 f
3 A 00:00:00.889 - 00:00:01.568 889 1568 679 nf
4 A 00:00:01.568 - 00:00:02.183 1568 2183 615 f
5 A 00:00:02.183 - 00:00:03.642 2183 3642 1459 nf
6 A 00:00:03.642 - 00:00:04.522 3642 4522 880 f
7 A 00:00:04.522 - 00:00:05.000 4522 5000 478 nf
8 B 00:00:00.000 - 00:00:00.660 0 660 660 f
9 B 00:00:00.660 - 00:00:01.247 660 1247 587 nf
10 B 00:00:01.247 - 00:00:02.229 1247 2229 982 f
11 B 00:00:02.229 - 00:00:03.697 2229 3697 1468 nf
12 B 00:00:03.697 - 00:00:04.926 3697 4926 1229 f
13 B 00:00:04.926 - 00:00:05.000 4926 5000 74 nf