I would like to create two new columns based on a third one. These two columns should have incrementing values of two different kinds.
Let´s take the following dataset as an example:
events <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49))
I would like to create:
- a column called "Number" incrementing everytime there is a sequence starting with 0 in the column "Value", and
- a column called "Duration" starting from 1 everytime a new sequence of 0s is present in the column "Value" and incrementing with 1 as long as the sequence of 0s continues.
Ideally, the final data frame would be this one:
events_final <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49),
Number = c(0, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 3, 3, 3, 0, 4, 4,
4, 0, 0, 5, 5, 0, 0, 6, 6, 6, 0, 7, 0, 0, 0, 0),
Duration = c(0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2,
3, 0, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0, 0))
I tried to use the tidyverse
to do so, but I do not manage to get what I need [I am even very far from it]:
events %>%
mutate(Number = ifelse(Value > 0, NA, 1),
Duration = case_when(Value == 0 & lag(Value, n = 1) != 0 ~ 1,
Value == 0 & lag(Value, n = 1) == 0 ~ 2))
By looking for related questions, I found that this was feasible in SQL [https://stackoverflow.com/questions/42971752/increment-value-based-on-another-column]. I also know that this is quite easy to be done in Excel [the first Value is in the cell B2]:
- Number column [Column C]: =IF(B2>0,0,IF(B1=0,C1,MAX(C$1:C1) 1))
- Duration column [Column D]: =IF(B2>0,0,IF(B1=0,D1 1,1))
But I need to have it work in R ;-)
Any help is welcome :-)
CodePudding user response:
You can leverage data.table::rleid()
twice here to solve the problem
library(data.table)
setDT(events)
events[, Number:=rleid(fifelse(Value==0,1,0))] %>%
.[Value==0,Number:=rleid(Number)] %>%
.[Value!=0,Number:=0] %>%
.[, Duration:=fifelse(Value==0, 1:.N,0), Number] %>%
.[]
Output:
Frame Value Number Duration
1: 1001 2.05 0 0
2: 1002 0.00 1 1
3: 1003 2.26 0 0
4: 1004 2.38 0 0
5: 1005 0.00 2 1
6: 1006 0.00 2 2
7: 1007 2.88 0 0
8: 1008 0.32 0 0
9: 1009 0.85 0 0
10: 1010 2.85 0 0
11: 1011 2.09 0 0
12: 1012 0.00 3 1
13: 1013 0.00 3 2
14: 1014 0.00 3 3
15: 1015 1.11 0 0
16: 1016 0.00 4 1
17: 1017 0.00 4 2
18: 1018 0.00 4 3
19: 1019 2.46 0 0
20: 1020 2.85 0 0
21: 1021 0.00 5 1
22: 1022 0.00 5 2
23: 1023 0.38 0 0
24: 1024 1.91 0 0
25: 1025 0.00 6 1
26: 1026 0.00 6 2
27: 1027 0.00 6 3
28: 1028 2.23 0 0
29: 1029 0.00 7 1
30: 1030 0.48 0 0
31: 1031 1.83 0 0
32: 1032 0.23 0 0
33: 1033 1.49 0 0
CodePudding user response:
Here is a tidyverse
solution:
library(tidyverse)
events |>
mutate(Number = replace(cumsum(Value == 0 & lag(Value != 0)), which(Value != 0), 0)) |>
group_by(tmp = cumsum(Value == 0 & lag(Value != 0))) |>
mutate(Duration = replace(row_number(), which(Value != 0), 0)) |>
ungroup() |>
select(-tmp)
#> # A tibble: 33 x 4
#> Frame Value Number Duration
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1001 2.05 0 0
#> 2 1002 0 1 1
#> 3 1003 2.26 0 0
#> 4 1004 2.38 0 0
#> 5 1005 0 2 1
#> 6 1006 0 2 2
#> 7 1007 2.88 0 0
#> 8 1008 0.32 0 0
#> 9 1009 0.85 0 0
#> 10 1010 2.85 0 0
#> # ... with 23 more rows
CodePudding user response:
Here's a dplyr
-based solution with a bit of cleverness for the Number
column, but still relying on data.table::rleid
for the Duration
column:
events_final %>%
mutate(
add = Value == 0 & lag(Value) != 0,
Number_result = cumsum(add) * (Value == 0),
rle_0 = data.table::rleid(Value == 0)
) %>%
group_by(rle_0) %>%
mutate(
Duration_result = ifelse(Value == 0, row_number(), 0)
) %>%
ungroup() %>%
select(-add, -rle_0)
# # A tibble: 33 × 6
# Frame Value Number Duration Number_result Duration_result
# <dbl> <dbl> <dbl> <dbl> <int> <dbl>
# 1 1001 2.05 0 0 0 0
# 2 1002 0 1 1 1 1
# 3 1003 2.26 0 0 0 0
# 4 1004 2.38 0 0 0 0
# 5 1005 0 2 1 2 1
# 6 1006 0 2 2 2 2
# 7 1007 2.88 0 0 0 0
# 8 1008 0.32 0 0 0 0
# 9 1009 0.85 0 0 0 0
# 10 1010 2.85 0 0 0 0
# # … with 23 more rows
# # ℹ Use `print(n = ...)` to see more rows
CodePudding user response:
Here is another (ugly) way to do it. Nowhere near as elegant as @langtang's solution but it works...
events <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49))
events_final <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49),
Number = c(0, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 3, 3, 3, 0, 4, 4,
4, 0, 0, 5, 5, 0, 0, 6, 6, 6, 0, 7, 0, 0, 0, 0),
Duration = c(0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2,
3, 0, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0, 0))
library(stringr)
events$Number <- events$Value == 0
events$tmp <- NA
i <- 0
lapply(2:nrow(events), function(x) {
if ((events[ x, 'Number' ] == TRUE) &
(events[ x - 1, 'Number' ] == FALSE)) {
i <<- i 1
events[ x, 'tmp' ] <<- i
} else if ((events[ x, 'Number' ] == TRUE) &
(events[ x - 1, 'Number' ] == TRUE)) {
events[ x, 'tmp' ] <<- i
}
}) |>
invisible()
idx <- which(is.na(events$tmp))
events[ idx, 'tmp' ] <- 0
events <- split(events, events$tmp) |>
lapply(function(x) {
if (unique(x$tmp) > 0) {
x$duration <- 1:nrow(x)
} else {
x$duration <- 0
}
x
}) |>
data.table::rbindlist(fill = TRUE) |>
as.data.frame()
idx <- order(events$Frame)
events <- events[ idx, ]
events$Number <- NULL
colnames(events) <- c('Frame', 'Value', 'Number', 'Duration')
rownames(events) <- NULL
print(events)
identical(events, events_final)