Home > Enterprise >  R count number of pattern matches in consecutive columns
R count number of pattern matches in consecutive columns

Time:05-31

I have a dataframe of '0's and '1's, like so:

DATA <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
                   "V2" = c(1,0,0,0,1,1,0,1,1,1),
                   "V3" = c(0,0,0,0,1,0,0,1,1,1),
                   "V4" = c(1,1,1,0,1,1,0,1,1,1),
                   "V5" = c(0,0,0,0,1,1,0,1,1,1))

I want to know how many times in each row a '0' is followed by a '1' in the next column. If the first column value is a '1', this should also be counted.

I have a loop which binds each row into a vector and then counts the number of '01's using either stringi::stri_count_fixed or stringr::str_count:

  for(n in 1:nrow(DATA)) {
    # Paste row into a single character vector, with extra 0 at start in case
    # the first column value is 1.
    STRING <- do.call(paste0, c(0, DATA[n, 1:ncol(DATA)]))

    # Count number of 0-1 transitions.
    COUNT <- stringr::str_count(STRING, pattern = "01")

    # Add this to the summary column.
    DATA$Count[n] <- COUNT
  }

However, both of these are very slow with my real dataset (3000 - 4000 columns). Any ideas for speeding this up?

Desired output:

> DATA$Count
[1] 2 1 1 0 1 2 0 1 1 1

CodePudding user response:

A possible solution, in base R:

DATA$Count <- 
  apply(DATA, 1, \(x) x[1]   sum((x[2:length(x)] - x[1:(length(x)-1)]) > 0))
DATA

#>    V1 V2 V3 V4 V5 Count
#> 1   0  1  0  1  0     2
#> 2   0  0  0  1  0     1
#> 3   0  0  0  1  0     1
#> 4   0  0  0  0  0     0
#> 5   1  1  1  1  1     1
#> 6   1  1  0  1  1     2
#> 7   0  0  0  0  0     0
#> 8   1  1  1  1  1     1
#> 9   1  1  1  1  1     1
#> 10  1  1  1  1  1     1

CodePudding user response:

Using dplyr:

DATA %>%
    rowwise() %>%
    mutate(count = sum(diff(c(0, c_across(everything()))) == 1))

     V1    V2    V3    V4    V5 count
   <dbl> <dbl> <dbl> <dbl> <dbl> <int>
 1     0     1     0     1     0     2
 2     0     0     0     1     0     1
 3     0     0     0     1     0     1
 4     0     0     0     0     0     0
 5     1     1     1     1     1     1
 6     1     1     0     1     1     2
 7     0     0     0     0     0     0
 8     1     1     1     1     1     1
 9     1     1     1     1     1     1
10     1     1     1     1     1     1

CodePudding user response:

library(data.table)

DATA$Count = 
  lapply(transpose(DATA), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |> 
  unlist(use.names = FALSE)

# > DATA
#    V1 V2 V3 V4 V5 Count
# 1   0  1  0  1  0     2
# 2   0  0  0  1  0     1
# 3   0  0  0  1  0     1
# 4   0  0  0  0  0     0
# 5   1  1  1  1  1     1
# 6   1  1  0  1  1     2
# 7   0  0  0  0  0     0
# 8   1  1  1  1  1     1
# 9   1  1  1  1  1     1
# 10  1  1  1  1  1     1

Benchmarks:

df = data.table::setDF(lapply(seq_len(4000L), \(x) sample(0L:1L, size = 100L, replace=TRUE)))


bench::mark(
  sindri = {
    lapply(transpose(df), \(x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |> 
      unlist(use.names = FALSE)
  },
  tmfmnk = {
    df %>%
      rowwise() %>%
      mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
  },
  Yuriy = {
    rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1)   (rowSums(df) == ncol(df))
  },
  iterations = 1L,
  check = FALSE,
  relative = TRUE
)



  expression   min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                  time           gc              
  <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                  <list>         <list>          
1 sindri       1      1      113.        1         NaN     1     0    12.77ms <NULL> <Rprofmem [559 x 3]>    <bench_tm [1]> <tibble [1 x 3]>
2 tmfmnk     113.   113.       1        31.1       Inf     1     1      1.45s <NULL> <Rprofmem [30,205 x 3]> <bench_tm [1]> <tibble [1 x 3]>
3 Yuriy       35.8   35.8      3.17      1.37      NaN     1     0   456.42ms <NULL> <Rprofmem [8,260 x 3]>  <bench_tm [1]> <tibble [1 x 3]>

CodePudding user response:

base

df <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
                 "V2" = c(1,0,0,0,1,1,0,1,1,1),
                 "V3" = c(0,0,0,0,1,0,0,1,1,1),
                 "V4" = c(1,1,1,0,1,1,0,1,1,1),
                 "V5" = c(0,0,0,0,1,1,0,1,1,1))

df$Count <-
  rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1)   
  (rowSums(df) == ncol(df))


df
#>    V1 V2 V3 V4 V5 Count
#> 1   0  1  0  1  0     2
#> 2   0  0  0  1  0     1
#> 3   0  0  0  1  0     1
#> 4   0  0  0  0  0     0
#> 5   1  1  1  1  1     1
#> 6   1  1  0  1  1     1
#> 7   0  0  0  0  0     0
#> 8   1  1  1  1  1     1
#> 9   1  1  1  1  1     1
#> 10  1  1  1  1  1     1

Created on 2022-05-30 by the reprex package (v2.0.1)

  • Related