Update a grouping variable with continuous counts-CodePudding

I have a grouping variable Seq that needs updating:

df
   Line Seq      Q
21  649   9 q_misc
22  650   9   <NA>
23  651   9   <NA>
24  670  10 q_misc
25  682   0 q_misc
26  898   0 q_misc
27  899   0   <NA>
28  900   0   <NA>
29  901   0   <NA>
30 1009  15 q_misc
31 1010  15   <NA>
32 1011  15   <NA>
33 1283  22 q_misc
34 1467   0 q_misc
35 1468   0   <NA>
36 1649   0 q_misc
37 1650   0   <NA>
38 1651   0   <NA>
39 1652   0   <NA>
40 1653   0   <NA>

I need Seq to start from 1 and to assign the next consecutive number when there is a new q_misc token in column Q.

Desired output:

df
   Line Seq      Q
21  649   1 q_misc
22  650   1   <NA>
23  651   1   <NA>
24  670   2 q_misc
25  682   3 q_misc
26  898   4 q_misc
27  899   4   <NA>
28  900   4   <NA>
29  901   4   <NA>
30 1009   5 q_misc
31 1010   5   <NA>
32 1011   5   <NA>
33 1283   6 q_misc
34 1467   7 q_misc
35 1468   7   <NA>
36 1649   8 q_misc
37 1650   8   <NA>
38 1651   8   <NA>
39 1652   8   <NA>
40 1653   8   <NA>

I'd be grateful for pointers how that can be done.

Reproducible data:

df <- structure(list(Line = c(649L, 650L, 651L, 670L, 682L, 898L, 899L, 
                              900L, 901L, 1009L, 1010L, 1011L, 1283L, 1467L, 1468L, 1649L, 
                              1650L, 1651L, 1652L, 1653L), Seq = c(9L, 9L, 9L, 10L, 0L, 0L, 
                                                                   0L, 0L, 0L, 15L, 15L, 15L, 22L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 
                     Q = c("q_misc", NA, NA, "q_misc", "q_misc", "q_misc", NA, 
                           NA, NA, "q_misc", NA, NA, "q_misc", "q_misc", NA, "q_misc", 
                           NA, NA, NA, NA)), row.names = 21:40, class = "data.frame")

CodePudding user response：

You may use cumsum after replacing NA with empty values (or any value other than 'q_misc').

transform(df, Seq = cumsum(replace(Q, is.na(Q), '') == 'q_misc'))

#   Line Seq      Q
#21  649   1 q_misc
#22  650   1   <NA>
#23  651   1   <NA>
#24  670   2 q_misc
#25  682   3 q_misc
#26  898   4 q_misc
#27  899   4   <NA>
#28  900   4   <NA>
#29  901   4   <NA>
#30 1009   5 q_misc
#31 1010   5   <NA>
#32 1011   5   <NA>
#33 1283   6 q_misc
#34 1467   7 q_misc
#35 1468   7   <NA>
#36 1649   8 q_misc
#37 1650   8   <NA>
#38 1651   8   <NA>
#39 1652   8   <NA>
#40 1653   8   <NA>

In dplyr

library(dplyr)
df %>% mutate(Seq = cumsum(replace(Q, is.na(Q), '') == 'q_misc'))

If the Q column has only NA and 'q_misc' values you can also use.

df %>% mutate(Seq = cumsum(!is.na(Q)))

CodePudding user response：

Using %in%

library(dplyr)
library(tidyr)
df %>%
     mutate(Seq = cumsum(Q %in% 'q_misc'))

-output

Line Seq      Q
21  649   1 q_misc
22  650   1   <NA>
23  651   1   <NA>
24  670   2 q_misc
25  682   3 q_misc
26  898   4 q_misc
27  899   4   <NA>
28  900   4   <NA>
29  901   4   <NA>
30 1009   5 q_misc
31 1010   5   <NA>
32 1011   5   <NA>
33 1283   6 q_misc
34 1467   7 q_misc
35 1468   7   <NA>
36 1649   8 q_misc
37 1650   8   <NA>
38 1651   8   <NA>
39 1652   8   <NA>
40 1653   8   <NA>