How to increment a new column based on value in another column in R-CodePudding

I would like to create two new columns based on a third one. These two columns should have incrementing values of two different kinds.

Let´s take the following dataset as an example:

events <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
                     Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
                               0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49))

I would like to create:

a column called "Number" incrementing everytime there is a sequence starting with 0 in the column "Value", and
a column called "Duration" starting from 1 everytime a new sequence of 0s is present in the column "Value" and incrementing with 1 as long as the sequence of 0s continues.

Ideally, the final data frame would be this one:

events_final <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
                           Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
                                     0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49),
                           Number = c(0, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 3, 3, 3, 0, 4, 4,
                                      4, 0, 0, 5, 5, 0, 0, 6, 6, 6, 0, 7, 0, 0, 0, 0),
                           Duration = c(0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2,
                                        3, 0, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0, 0))

I tried to use the tidyverse to do so, but I do not manage to get what I need [I am even very far from it]:

events %>%
  mutate(Number = ifelse(Value > 0, NA, 1),
         Duration = case_when(Value == 0 & lag(Value, n = 1) != 0 ~ 1,
                              Value == 0 & lag(Value, n = 1) == 0 ~ 2))

By looking for related questions, I found that this was feasible in SQL [https://stackoverflow.com/questions/42971752/increment-value-based-on-another-column]. I also know that this is quite easy to be done in Excel [the first Value is in the cell B2]:

Number column [Column C]: =IF(B2>0,0,IF(B1=0,C1,MAX(C$1:C1) 1))
Duration column [Column D]: =IF(B2>0,0,IF(B1=0,D1 1,1))

But I need to have it work in R ;-)

Any help is welcome :-)

CodePudding user response：

You can leverage data.table::rleid() twice here to solve the problem

library(data.table)
setDT(events)

events[, Number:=rleid(fifelse(Value==0,1,0))] %>% 
  .[Value==0,Number:=rleid(Number)] %>% 
  .[Value!=0,Number:=0] %>% 
  .[, Duration:=fifelse(Value==0, 1:.N,0), Number] %>% 
  .[]

Output:

    Frame Value Number Duration
 1:  1001  2.05      0        0
 2:  1002  0.00      1        1
 3:  1003  2.26      0        0
 4:  1004  2.38      0        0
 5:  1005  0.00      2        1
 6:  1006  0.00      2        2
 7:  1007  2.88      0        0
 8:  1008  0.32      0        0
 9:  1009  0.85      0        0
10:  1010  2.85      0        0
11:  1011  2.09      0        0
12:  1012  0.00      3        1
13:  1013  0.00      3        2
14:  1014  0.00      3        3
15:  1015  1.11      0        0
16:  1016  0.00      4        1
17:  1017  0.00      4        2
18:  1018  0.00      4        3
19:  1019  2.46      0        0
20:  1020  2.85      0        0
21:  1021  0.00      5        1
22:  1022  0.00      5        2
23:  1023  0.38      0        0
24:  1024  1.91      0        0
25:  1025  0.00      6        1
26:  1026  0.00      6        2
27:  1027  0.00      6        3
28:  1028  2.23      0        0
29:  1029  0.00      7        1
30:  1030  0.48      0        0
31:  1031  1.83      0        0
32:  1032  0.23      0        0
33:  1033  1.49      0        0

CodePudding user response：

Here is a tidyverse solution:

library(tidyverse)

events |>
  mutate(Number = replace(cumsum(Value == 0 & lag(Value != 0)), which(Value != 0), 0)) |>
  group_by(tmp = cumsum(Value == 0 & lag(Value != 0))) |>
  mutate(Duration = replace(row_number(), which(Value != 0), 0)) |>
  ungroup() |>
  select(-tmp)
#> # A tibble: 33 x 4
#>    Frame Value Number Duration
#>    <dbl> <dbl>  <dbl>    <dbl>
#>  1  1001  2.05      0        0
#>  2  1002  0         1        1
#>  3  1003  2.26      0        0
#>  4  1004  2.38      0        0
#>  5  1005  0         2        1
#>  6  1006  0         2        2
#>  7  1007  2.88      0        0
#>  8  1008  0.32      0        0
#>  9  1009  0.85      0        0
#> 10  1010  2.85      0        0
#> # ... with 23 more rows

CodePudding user response：

Here's a dplyr-based solution with a bit of cleverness for the Number column, but still relying on data.table::rleid for the Duration column:

events_final %>%
  mutate(
    add = Value == 0 & lag(Value) != 0,
    Number_result = cumsum(add) * (Value == 0),
    rle_0 = data.table::rleid(Value == 0)
  ) %>%
  group_by(rle_0) %>%
  mutate(
    Duration_result = ifelse(Value == 0, row_number(), 0)
  ) %>%
  ungroup() %>%
  select(-add, -rle_0)
# # A tibble: 33 × 6
#    Frame Value Number Duration Number_result Duration_result
#    <dbl> <dbl>  <dbl>    <dbl>         <int>           <dbl>
#  1  1001  2.05      0        0             0               0
#  2  1002  0         1        1             1               1
#  3  1003  2.26      0        0             0               0
#  4  1004  2.38      0        0             0               0
#  5  1005  0         2        1             2               1
#  6  1006  0         2        2             2               2
#  7  1007  2.88      0        0             0               0
#  8  1008  0.32      0        0             0               0
#  9  1009  0.85      0        0             0               0
# 10  1010  2.85      0        0             0               0
# # … with 23 more rows
# # ℹ Use `print(n = ...)` to see more rows

CodePudding user response：

Here is another (ugly) way to do it. Nowhere near as elegant as @langtang's solution but it works...

events <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
                     Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
                               0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49))

events_final <- data.frame(Frame = seq(from = 1001, to = 1033, by = 1),
                           Value = c(2.05, 0, 2.26, 2.38, 0, 0, 2.88, 0.32, 0.85, 2.85, 2.09, 0, 0, 0, 1.11, 0, 0,
                                     0, 2.46, 2.85, 0, 0, 0.38, 1.91, 0, 0, 0, 2.23, 0, 0.48, 1.83, 0.23, 1.49),
                           Number = c(0, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 3, 3, 3, 0, 4, 4,
                                      4, 0, 0, 5, 5, 0, 0, 6, 6, 6, 0, 7, 0, 0, 0, 0),
                           Duration = c(0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2,
                                        3, 0, 0, 1, 2, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0, 0))
library(stringr)
events$Number <- events$Value == 0
events$tmp <- NA
i <- 0
lapply(2:nrow(events), function(x) {
  if ((events[ x, 'Number' ] == TRUE) & 
      (events[ x - 1, 'Number' ] == FALSE)) { 
    i <<- i   1
    events[ x, 'tmp' ] <<- i
  } else if ((events[ x, 'Number' ] == TRUE) & 
             (events[ x - 1, 'Number' ] == TRUE)) {
    events[ x, 'tmp' ] <<- i
  }
}) |> 
  invisible()
idx <- which(is.na(events$tmp))
events[ idx, 'tmp' ] <- 0
events <- split(events, events$tmp) |> 
  lapply(function(x) {
  if (unique(x$tmp) > 0) { 
    x$duration <- 1:nrow(x)
  } else {
    x$duration <- 0
  }
  x
}) |> 
  data.table::rbindlist(fill = TRUE) |>
  as.data.frame()

idx <- order(events$Frame)
events <- events[ idx, ]
events$Number <- NULL
colnames(events) <- c('Frame', 'Value', 'Number', 'Duration')
rownames(events) <- NULL
print(events)

identical(events, events_final)