my dataset consists of a series of behaviours observed in videos. For each behaviour, I have recorded when it start and when it end.
datain <-data.frame(
A=c("3/4 6/8 11/16","0/5"),
B=c("0/5 15/20","5/10"),
C=c("0/5","3/10"))
I would like to get the duration of each behaviour, like in this desired output
dataout <-data.frame(
A=c("3/4 6/8 11/16","0/5"),
B=c("0/5 10/5","5/10"),
C=c("0/5","3/10"),
A.sum=c(8,5),
B.sum=c(10,5),
C.sum=c(5,7))
I am experimenting with the following lines to identify which columns have the and to extract the rows with the interrupted behaviours (but I still have to calculate the duration of each behaviour), but I guess there could be more efficient solution than the one I am currently attempting.
d.1 <- lapply(datain, function(x) str_which(x,"\\ "))
d.2 <- which(lapply(d.1,length)>0)
coltosum <- match(names(d.2),colnames(datain))
mylist <- lapply(datain[coltosum],function(x) strsplit(x,"\\ "))
As always, I would greatly appreciate any suggestion.
Stef
CodePudding user response:
An option in base R
would be to loop over the columns (lapply
) of the dataset, then replace the digits (\\d
) followed by /
and digits to denominator - numerator by capturing those digits and switching the backreferences (\\2-\\1
), and eval(parse
the string
datain[paste0(names(datain), ".sum")] <- lapply(datain, function(y)
sapply(gsub("(\\d )/(\\d )", "(\\2-\\1)", y),
function(x) eval(parse(text = x))))
-checking with OP's output
> datain
A B C A.sum B.sum C.sum
1 3/4 6/8 11/16 0/5 15/20 0/5 8 10 5
2 0/5 5/10 3/10 5 5 7
> dataout
A B C A.sum B.sum C.sum
1 3/4 6/8 11/16 0/5 10/5 0/5 8 10 5
2 0/5 5/10 3/10 5 5 7
Or with tidyverse
, group by rows, loop across
all the columns, read the string into a data.frame with read.table
, subtract the columns, get the sum
and return as new columns by modifying the .names
library(dplyr)
library(stringr)
datain %>%
rowwise %>%
mutate(across(everything(), ~ sum(with(read.table(text =
str_replace_all(.x, fixed(" "), "\n"), sep = "/",
header = FALSE), V2 - V1)), .names = "{.col}.sum")) %>%
ungroup
-output
# A tibble: 2 × 6
A B C A.sum B.sum C.sum
<chr> <chr> <chr> <int> <int> <int>
1 3/4 6/8 11/16 0/5 15/20 0/5 8 10 5
2 0/5 5/10 3/10 5 5 7
CodePudding user response:
Another base R approach might be the following. First split by
, then split again by /
, taking the sum
of diff
erences in the resulting values.
datain[paste0(names(datain), ".sum")] <-
lapply(datain, function(x) {
sapply(strsplit(x, "[ ]"), function(y) {
sum(sapply(strsplit(y, "[/]"), function(z) {
diff(as.numeric(z)) }
))
})
})
datain
Output
A B C A.sum B.sum C.sum
1 3/4 6/8 11/16 0/5 15/20 0/5 8 10 5
2 0/5 5/10 3/10 5 5 7
CodePudding user response:
Update:
Slightly improved:
library(dplyr)
library(tidyr)
library(data.table)
datain %>%
pivot_longer(everything()) %>%
separate_rows(value, sep = "\\ |\\/", convert = TRUE) %>%
group_by(group = rleid(name)) %>%
mutate(value = value - lag(value, default = value[1])) %>%
slice(which(row_number() %% 2 == 0)) %>%
mutate(value = sum(value),
name = paste0(name, ".sum")) %>%
slice(1) %>%
ungroup() %>%
select(-group) %>%
group_by(name) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = name, values_from = value) %>%
select(-id) %>%
cbind(datain)
This row
separate_rows(value, sep = "\\ |\\/", convert = TRUE) %>%
is same as
separate_rows(value, sep = "\\ ") %>%
separate_rows(value, sep = "\\/") %>%
type.convert(as.is = TRUE) %>%
The very very long way until finish: :-)
library(dplyr)
library(tidyr)
library(data.table)
datain %>%
pivot_longer(everything()) %>%
separate_rows(value, sep = "\\ ") %>%
separate_rows(value, sep = "\\/") %>%
group_by(group =as.integer(gl(n(),2,n()))) %>%
type.convert(as.is = TRUE) %>%
mutate(x = value - lag(value, default = value[1])) %>%
ungroup() %>%
group_by(group = rleid(name)) %>%
mutate(x = sum(x)) %>%
mutate(labels = paste0(name, ".sum")) %>%
slice(1) %>%
ungroup() %>%
select(-c(name, group, value)) %>%
pivot_wider(names_from = labels,
values_from = x,
values_fn = list) %>%
unnest(cols = c(A.sum, B.sum, C.sum)) %>%
cbind(datain)
A.sum B.sum C.sum A B C
1 8 10 5 3/4 6/8 11/16 0/5 15/20 0/5
2 5 5 7 0/5 5/10 3/10