I have a following data frame:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
I would like to remove zeros at the end of var1 for each id if the number of repeating zeros is larger than or equal to the number of subsequently repeating zeros anywhere within the data frame for a particular id.
example:
df[df$id=="a",]
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
13 a 0
14 a 0
15 a 0
16 a 0
17 a 0
18 a 0
19 a 0
20 a 0
For id "a", we have a series of 8 subsequent zeros at the end, which is the same length as a previous series of zeros, therefore, zeros at the end should be removed and new id "a" should look like this:
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
For id "b", we see there are only 2 subsequent zeros at the end, which is less than max number of subsequent zeros and therefore nothing should be done.
df[df$id=="b",]
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
CodePudding user response:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
library(data.table)
library(magrittr)
setDT(df)
to_remove <-
# get all run lengths of 0s
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
# only for ids with trailing 0s
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
# only if the last is longer than all previous for that id
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)]
to_remove
#> a h
#> 8 8
df[, head(.SD, .N - fcoalesce(to_remove[id], 0L))
, by = id]
#> id var1
#> 1: a 0
#> 2: a 1
#> 3: a 1
#> 4: a 0
#> 5: a 0
#> ---
#> 180: j 0
#> 181: j 0
#> 182: j 1
#> 183: j 0
#> 184: j 0
Created on 2021-11-24 by the reprex package (v2.0.1)
In a single chain:
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)] %>%
{df[, head(.SD, .N - fcoalesce(.[id], 0L)), id]}
# id var1
# 1: a 0
# 2: a 1
# 3: a 1
# 4: a 0
# 5: a 0
# ---
# 180: j 0
# 181: j 0
# 182: j 1
# 183: j 0
# 184: j 0
CodePudding user response:
We create a function with rle
- with two parameters 'x' and the threshold ('thresh'), apply the rle
(run-length-encoding) on the input 'x' (rle
- returns a list
output with lengths
and values
as two vectors). Check whether the last element of values
is 0 and its corresponding lengths
is greater than or equal to the threshold passed, then replace the last element of logical TRUE vector ('tmp1') to FALSE, and return the rep
licated 'tmp1'.
Do a group by 'id' in ave
, apply the function and subset
the rows
f1 <- function(x, thresh) {
with(rle(x), {
tmp1 <- rep(TRUE, length(values))
tmp2 <- values[length(values)]
tmp1[length(tmp1)][tmp2 == 0 & lengths[length(values)] >= thresh] <- FALSE
rep(tmp1, lengths)
})
}
-testing
out <- subset(df, as.logical(ave(var1, id, FUN = function(x) f1(x, 8))))
-output
> subset(out, id == 'a')
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
> subset(out, id == 'b')
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
CodePudding user response:
Version that is a single pipeline for no particular reason:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
df %>%
group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(
data = purrr::map(data, ~rle(.x$var1)),
max = purrr::map_int(data, ~max(.x$lengths[.x$values == 0])),
last = purrr::map_int(data, ~{
.x$lengths[.x$values == 0][length(.x$lengths[.x$values == 0])]
})
) %>%
dplyr::mutate(
data = purrr::map(
data, ~{
if(max > last) {
x <- inverse.rle(.x)
len <- length(x)
x[(len - last):len] <- NA
x
} else {
inverse.rle(.x)
}
}
)
) %>%
dplyr::select(id, data) %>%
tidyr::unnest(c(id, data)) %>%
tidyr::drop_na()