How to count frequencies of n-sequences repeatedly shifted by one?-CodePudding

I have this sequence of numbers:

> Results
 [1] 0 0 0 1 1 1 0 1 0 1 1

In a previous question, I learned how to write a function that counts the number of consecutive "n" sequences (left to right) that occur in a string.

Using the answer provided in the previous question - I tried this function on my data:

n_sequences <- function(n, results) {
  helper <- function(i, n) if (n < 1) "" else sprintf(
    "%s%s", 
    helper(i, n - 1), 
    results[i   n - 1]
  )
  result <- data.frame(
    table(
      sapply(
        1:(length(results) - n),
        function(i) helper(i, n)
      )
    )
  )
  colnames(result) <- c("Sequence", "Frequency")
  result
}

# consecutive 4 sequence
n_sequences(4, Results)

This provided me with the following answer:

  Sequence Frequency
1     0001         1
2     0011         1
3     0101         1
4     0111         1
5     1010         1
6     1101         1
7     1110         1

All appears to be good - but looking more closely:

> Results
 [1] 0 0 0 1 1 1 0 1 0 1 1

I think the above table is missing the last combination 1,0,1,1

Does anyone know how the above function can be modified to include this last sequence?

Note: In the future, I am interested in using this table to calculate conditional probabilities. For example, given that three 0's appeared, what is the probability that the next number is 1?

Note: I am always interested in learning about alternate ways to write functions that can accomplish the same task - if anyone else has an alternate method to calculate these frequency counts, I would be interested!

CodePudding user response：

Here is an option using a data.table grouping operation.

library(data.table)

Results <- c(0,0,0,1,1,1,0,1,0,1,1)

n_sequences <- function(n, results) {
  as.data.table(
    matrix(
      results[sequence(rep(length(results) - n   1, n), 1:n)],
      ncol = n
    )
  )[
    , .(sequence = paste0(.BY, collapse = ""), Frequency = .N), by = eval(paste0("V", 1:n))
  ][
    ,paste0("V", 1:n) := NULL
  ]
}

n_sequences(4, Results)[]
#>    sequence Frequency
#> 1:     0001         1
#> 2:     0011         1
#> 3:     0111         1
#> 4:     1110         1
#> 5:     1101         1
#> 6:     1010         1
#> 7:     0101         1
#> 8:     1011         1

Its performance is very favorable compared to a vapply approach.

f <- function(x, n, ordered=TRUE) {
  len <- length(x)
  stopifnot(!anyNA(x) && n > 0L && n <= len)
  v <- paste(x, collapse='')
  sq <- vapply(0:(len - n), function(i) substr(v, 1L   i, n   i), character(1L))
  out <- as.data.frame(table(sq))
  if (!ordered) `rownames<-`(out[match(out$sq, unique(sq)), ], NULL) else out
}

Results <- sample(0:1, 1e7, 1)
system.time(Frequency1 <- f(Results, 6))
#>    user  system elapsed 
#>   15.30    0.17   15.49
system.time(Frequency2 <- n_sequences(6, Results))
#>    user  system elapsed 
#>    1.03    0.50    0.80
identical(Frequency1$Freq, setorder(Frequency2, sequence)$Frequency)
#> [1] TRUE

CodePudding user response：

We may paste the vector to a string and use substr from 1 i to n i incrementing i from 0 to len - n using vapply to get the sequences that can easily be tabled.

f <- \(x, n, ordered=TRUE) {
  len <- length(x)
  stopifnot(!anyNA(x) && n > 0L && n <= len)
  v <- paste(x, collapse='')
  sq <- vapply(0:(len - n), \(i) substr(v, 1L   i, n   i), character(1L))
  out <- as.data.frame(table(sq))
  if (!ordered) `rownames<-`(out[match(out$sq, unique(sq)), ], NULL) else out
}

x <- c(0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1)

f(x, 4)
#     sq Freq
# 1 0001    1
# 2 0011    1
# 3 0101    1
# 4 0111    1
# 5 1010    1
# 6 1011    1
# 7 1101    1
# 8 1110    1

If we want order of appearance rather than alphabetical order:

f(x, 4, ordered=FALSE)
#     sq Freq
# 1 0001    1
# 2 0011    1
# 3 1101    1
# 4 0101    1
# 5 1011    1
# 6 1110    1
# 7 1010    1
# 8 0111    1

This works pretty fast,

set.seed(42)
system.time(res <- f(sample(0:1, 1e3, replace=TRUE), 5))
#  user  system elapsed 
# 0.005   0.001   0.004 
res
#       sq Freq
# 1  00000   18
# 2  00001   26
# 3  00010   41
# ...
# 30 11101   32
# 31 11110   34
# 32 11111   20

even for relatively large vectors.

f(sample(0:1, 1e7, replace=TRUE), 6) |> system.time()
#   user  system elapsed 
# 34.668   0.000  34.643