I have a dataframe of '0's and '1's, like so:
DATA <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
"V2" = c(1,0,0,0,1,1,0,1,1,1),
"V3" = c(0,0,0,0,1,0,0,1,1,1),
"V4" = c(1,1,1,0,1,1,0,1,1,1),
"V5" = c(0,0,0,0,1,1,0,1,1,1))
I want to know how many times in each row a '0' is followed by a '1' in the next column. If the first column value is a '1', this should also be counted.
I have a loop which binds each row into a vector and then counts the number of '01's using either stringi::stri_count_fixed
or stringr::str_count
:
for(n in 1:nrow(DATA)) {
# Paste row into a single character vector, with extra 0 at start in case
# the first column value is 1.
STRING <- do.call(paste0, c(0, DATA[n, 1:ncol(DATA)]))
# Count number of 0-1 transitions.
COUNT <- stringr::str_count(STRING, pattern = "01")
# Add this to the summary column.
DATA$Count[n] <- COUNT
}
However, both of these are very slow with my real dataset (3000 - 4000 columns). Any ideas for speeding this up?
Desired output:
> DATA$Count
[1] 2 1 1 0 1 2 0 1 1 1
CodePudding user response:
A possible solution, in base R
:
DATA$Count <-
apply(DATA, 1, \(x) x[1] sum((x[2:length(x)] - x[1:(length(x)-1)]) > 0))
DATA
#> V1 V2 V3 V4 V5 Count
#> 1 0 1 0 1 0 2
#> 2 0 0 0 1 0 1
#> 3 0 0 0 1 0 1
#> 4 0 0 0 0 0 0
#> 5 1 1 1 1 1 1
#> 6 1 1 0 1 1 2
#> 7 0 0 0 0 0 0
#> 8 1 1 1 1 1 1
#> 9 1 1 1 1 1 1
#> 10 1 1 1 1 1 1
CodePudding user response:
Using dplyr
:
DATA %>%
rowwise() %>%
mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
V1 V2 V3 V4 V5 count
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 0 1 0 1 0 2
2 0 0 0 1 0 1
3 0 0 0 1 0 1
4 0 0 0 0 0 0
5 1 1 1 1 1 1
6 1 1 0 1 1 2
7 0 0 0 0 0 0
8 1 1 1 1 1 1
9 1 1 1 1 1 1
10 1 1 1 1 1 1
CodePudding user response:
library(data.table)
DATA$Count =
lapply(transpose(DATA), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |>
unlist(use.names = FALSE)
# > DATA
# V1 V2 V3 V4 V5 Count
# 1 0 1 0 1 0 2
# 2 0 0 0 1 0 1
# 3 0 0 0 1 0 1
# 4 0 0 0 0 0 0
# 5 1 1 1 1 1 1
# 6 1 1 0 1 1 2
# 7 0 0 0 0 0 0
# 8 1 1 1 1 1 1
# 9 1 1 1 1 1 1
# 10 1 1 1 1 1 1
Benchmarks:
df = data.table::setDF(lapply(seq_len(4000L), \(x) sample(0L:1L, size = 100L, replace=TRUE)))
bench::mark(
sindri = {
lapply(transpose(df), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |>
unlist(use.names = FALSE)
},
tmfmnk = {
df %>%
rowwise() %>%
mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
},
Yuriy = {
rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1) (rowSums(df) == ncol(df))
},
iterations = 1L,
check = FALSE,
relative = TRUE
)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 sindri 1 1 113. 1 NaN 1 0 12.77ms <NULL> <Rprofmem [559 x 3]> <bench_tm [1]> <tibble [1 x 3]>
2 tmfmnk 113. 113. 1 31.1 Inf 1 1 1.45s <NULL> <Rprofmem [30,205 x 3]> <bench_tm [1]> <tibble [1 x 3]>
3 Yuriy 35.8 35.8 3.17 1.37 NaN 1 0 456.42ms <NULL> <Rprofmem [8,260 x 3]> <bench_tm [1]> <tibble [1 x 3]>
CodePudding user response:
base
df <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
"V2" = c(1,0,0,0,1,1,0,1,1,1),
"V3" = c(0,0,0,0,1,0,0,1,1,1),
"V4" = c(1,1,1,0,1,1,0,1,1,1),
"V5" = c(0,0,0,0,1,1,0,1,1,1))
df$Count <-
rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1)
(rowSums(df) == ncol(df))
df
#> V1 V2 V3 V4 V5 Count
#> 1 0 1 0 1 0 2
#> 2 0 0 0 1 0 1
#> 3 0 0 0 1 0 1
#> 4 0 0 0 0 0 0
#> 5 1 1 1 1 1 1
#> 6 1 1 0 1 1 1
#> 7 0 0 0 0 0 0
#> 8 1 1 1 1 1 1
#> 9 1 1 1 1 1 1
#> 10 1 1 1 1 1 1
Created on 2022-05-30 by the reprex package (v2.0.1)