remove trailing zeros from column based on condition-CodePudding

I have a following data frame:

set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)   
df<-data.frame(id, var1)

I would like to remove zeros at the end of var1 for each id if the number of repeating zeros is larger than or equal to the number of subsequently repeating zeros anywhere within the data frame for a particular id.

example:

df[df$id=="a",]

 id var1
1   a    0
2   a    1
3   a    1
4   a    0
5   a    0
6   a    0
7   a    0
8   a    0
9   a    0
10  a    0
11  a    0
12  a    1
13  a    0
14  a    0
15  a    0
16  a    0
17  a    0
18  a    0
19  a    0
20  a    0

For id "a", we have a series of 8 subsequent zeros at the end, which is the same length as a previous series of zeros, therefore, zeros at the end should be removed and new id "a" should look like this:

For id "b", we see there are only 2 subsequent zeros at the end, which is less than max number of subsequent zeros and therefore nothing should be done.

df[df$id=="b",]


 id var1
21  b    0
22  b    1
23  b    0
24  b    0
25  b    0
26  b    0
27  b    0
28  b    0
29  b    0
30  b    0
31  b    0
32  b    1
33  b    1
34  b    1
35  b    0
36  b    1
37  b    0
38  b    1
39  b    0
40  b    0

CodePudding user response：

set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)   
df<-data.frame(id, var1)

library(data.table)
library(magrittr)

setDT(df)

to_remove <- 
  # get all run lengths of 0s
  df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>% 
    # only for ids with trailing 0s
    .[df[, if (last(var1) == 0) id, id], on = .(id)] %>% 
    # only if the last is longer than all previous for that id
    .[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>% 
    .[, setNames(n_rem, id)]

to_remove
#> a h 
#> 8 8

df[,  head(.SD, .N - fcoalesce(to_remove[id], 0L))
   , by = id] 
#>      id var1
#>   1:  a    0
#>   2:  a    1
#>   3:  a    1
#>   4:  a    0
#>   5:  a    0
#>  ---        
#> 180:  j    0
#> 181:  j    0
#> 182:  j    1
#> 183:  j    0
#> 184:  j    0

^{Created on 2021-11-24 by the reprex package (v2.0.1)}

In a single chain:

df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>% 
  .[df[, if (last(var1) == 0) id, id], on = .(id)] %>% 
  .[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>% 
  .[, setNames(n_rem, id)] %>% 
  {df[, head(.SD, .N - fcoalesce(.[id], 0L)), id]}

#      id var1
#   1:  a    0
#   2:  a    1
#   3:  a    1
#   4:  a    0
#   5:  a    0
#  ---        
# 180:  j    0
# 181:  j    0
# 182:  j    1
# 183:  j    0
# 184:  j    0

CodePudding user response：

We create a function with rle - with two parameters 'x' and the threshold ('thresh'), apply the rle (run-length-encoding) on the input 'x' (rle - returns a list output with lengths and values as two vectors). Check whether the last element of values is 0 and its corresponding lengths is greater than or equal to the threshold passed, then replace the last element of logical TRUE vector ('tmp1') to FALSE, and return the replicated 'tmp1'.

Do a group by 'id' in ave, apply the function and subset the rows

f1 <-  function(x, thresh) {
     with(rle(x), {
         tmp1 <- rep(TRUE, length(values))
        tmp2 <- values[length(values)]
         tmp1[length(tmp1)][tmp2 == 0 & lengths[length(values)] >= thresh] <- FALSE
     rep(tmp1, lengths)
 })
}

-testing

out <- subset(df, as.logical(ave(var1, id, FUN = function(x) f1(x, 8))))

-output

> subset(out, id == 'a')
   id var1
1   a    0
2   a    1
3   a    1
4   a    0
5   a    0
6   a    0
7   a    0
8   a    0
9   a    0
10  a    0
11  a    0
12  a    1
> subset(out, id == 'b')
   id var1
21  b    0
22  b    1
23  b    0
24  b    0
25  b    0
26  b    0
27  b    0
28  b    0
29  b    0
30  b    0
31  b    0
32  b    1
33  b    1
34  b    1
35  b    0
36  b    1
37  b    0
38  b    1
39  b    0
40  b    0

CodePudding user response：

Version that is a single pipeline for no particular reason:

set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)   
df<-data.frame(id, var1)

df %>%
    group_by(id) %>%
    tidyr::nest() %>%
    dplyr::mutate(
        data = purrr::map(data, ~rle(.x$var1)),
        max = purrr::map_int(data, ~max(.x$lengths[.x$values == 0])),
        last = purrr::map_int(data, ~{
            .x$lengths[.x$values == 0][length(.x$lengths[.x$values == 0])]
        })
    ) %>%
    dplyr::mutate(
        data = purrr::map(
            data, ~{
                if(max > last) {
                    x <- inverse.rle(.x)
                    len <- length(x)
                    x[(len - last):len] <- NA
                    x
                } else {
                    inverse.rle(.x)
                }
            }
        )
    ) %>%
    dplyr::select(id, data) %>%
    tidyr::unnest(c(id, data)) %>%
    tidyr::drop_na()