Detect index based on a condition-CodePudding

I have a data frame whose columns contain blocks of 0s and 1s, plus some NAs, for example:

# Example input: two columns holding runs of 0s and 1s, padded with NAs.
mydata <- data.frame(
  a = c(0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, NA, NA, NA),
  b = c(0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, NA, NA, NA)
)

What I want is to obtain, for each variable, the start and end index of each block of 1s. This would be the desired result:

# Desired result: start and end row index of every block of 1s, per column.
# Note that this reuses the name `mydata` and so replaces the example input.
mydata <- data.frame(
  a = c(4, 6, 10, 11),
  b = c(3, 6, 9, 10)
)

How can I code it?

CodePudding user response：

You may try

# For every column, return the start and end row index of each run of 1s.
# With the same number of runs in every column the result is a matrix with
# one column per variable and rows ordered start1, end1, start2, end2, ...
apply(mydata, 2, function(x) {
  # Run-length encode "is this value 1?". Comparing NA with 1 gives NA, and
  # which() skips NA, so missing values never count as part of a block.
  runs <- rle(x == 1)
  one_runs <- which(runs$values)
  # Last row index of each run of 1s.
  ends <- cumsum(runs$lengths)[one_runs]
  # First row index = end - length + 1. Unlike "end of the previous run + 1",
  # this also works when the column begins with a run of 1s.
  starts <- ends - runs$lengths[one_runs] + 1
  sort(c(starts, ends))
})

      a  b
[1,]  4  3
[2,]  6  6
[3,] 10  9
[4,] 11 10

CodePudding user response：

Since you stated that you only have 0, 1 and NA, you could also use str_locate_all:

library(tidyverse)
# Collapse each column into a single string and locate every run of
# consecutive 1s with the regex "1+". t() followed by c() flattens the
# start/end match matrix into start1, end1, start2, end2, ...
# Caveat: an NA is pasted as the two characters "NA", so string positions
# equal row indices only for runs that are not preceded by an NA.
map_df(
  mydata,
  ~ c(t(str_locate_all(paste(., collapse = ""), "1+")[[1]]))
)

# A tibble: 4 x 2
      a     b
  <int> <int>
1     4     3
2     6     6
3    10     9
4    11    10

You could also arrange it in start/end format, with one row per block:

 # Same approach, but keep the start/end match matrix of each column as a
 # tibble and stack them, recording the source column name in `grp`.
 # The regex "1+" matches each run of consecutive 1s in the collapsed string.
 # Caveat: an NA is pasted as the two characters "NA", so string positions
 # equal row indices only for runs that are not preceded by an NA.
 map_df(
   mydata,
   ~ as_tibble(str_locate_all(paste(., collapse = ""), "1+")[[1]]),
   .id = "grp"
 )
# A tibble: 4 x 3
  grp   start   end
  <chr> <int> <int>
1 a         4     6
2 a        10    11
3 b         3     6
4 b         9    10

CodePudding user response：

We can use diff + cumsum to generate the grouping info and then use range to get the start and end of each block:

# For each column: take the positions of the 1s, split them into runs of
# consecutive positions, and keep the first and last position of every run.
# list2DF() then combines the per-column results into one data frame.
list2DF(
    lapply(
        mydata,
        function(x) {
            unlist(
                by(
                    # positions where x is 1 (which() skips NA comparisons);
                    # assigned to v here so the grouping argument can reuse it
                    v <- which(x == 1),
                    # group id grows by one whenever two positions are not adjacent
                    cumsum(c(0, diff(v) != 1)),
                    # min and max position of each group, i.e. block start and end
                    range
                )
            )
        }
    )
)

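which gives a data frame with the start and end indices of each block (4, 6, 10, 11 for a and 3, 6, 9, 10 for b).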

Another option is using aggregate

# Same idea applied to the whole data frame at once: which(..., arr.ind = TRUE)
# returns the (row, col) coordinates of every 1, and aggregate() then
# processes the row indices separately for each column index.
aggregate(
    . ~ col,
    data.frame(
        which(mydata == 1, arr.ind = TRUE)
    ),
    function(v) {
        by(
            v,
            # group id grows by one whenever two row indices are not adjacent
            cumsum(c(0, diff(v) != 1)),
            # first and last row index of each group
            range
        )
    },
    # keep each column's result as a list element instead of simplifying it
    simplify = FALSE
)

which gives

  col          row
1   1 4, 6, 10, 11
2   2  3, 6, 9, 10