Incrementing grouped identifiers-CodePudding

I have example data as follows:

library(data.table)
dat <- fread("Survey Variable_codes_2022
              D       D1
              A       A1
              B       B1
              B       B3
              B       B2
              E       E1
              B       NA
              E       NA")

For the two rows that have Variable_codes_2022==NA, I would like to increment the variable code so that it becomes:

dat <- fread("Survey Variable_codes_2022
              D       D1
              A       A1
              B       B1
              B       B3
              B       B2
              E       E1
              B       B4
              E       E2"

Because the column Variable_codes_2022 is a string variable, the numbers are not in numerical order.

I have no idea where to start and I was wondering if someone could help me on the right track.

CodePudding user response：

We could do it this way:

grouping
arranging and
mutate.

To keep the original order we could first create and id and then rearrange:

library(dplyr)

dat %>% 
  group_by(Survey) %>% 
  arrange(.by_group = TRUE) %>% 
  mutate(Variable_codes_2022 = paste0(Survey, row_number()))

  Survey Variable_codes_2022
  <chr>  <chr>              
1 A      A1                 
2 B      B1                 
3 B      B2                 
4 B      B3                 
5 B      B4                 
6 D      D1                 
7 E      E1                 
8 E      E2

CodePudding user response：

dat <- 
structure(list(survey = c("D", "A", "B", "B", "B", "E", "B", 
"E", "B"), var_code = c("D1", "A1", "B1", "B3", "B2", "E1", NA, 
NA, NA)), row.names = c(NA, -9L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000026db10f1ef0>)

library(dplyr)
library(stringr)

dat %>% 
  group_by(survey) %>% 
  mutate(
    aux1 = as.numeric(stringr::str_remove(var_code,survey)),
    aux2 = cumsum(is.na(var_code)),
    var_code = paste0(survey,max(aux1,na.rm = TRUE) aux2)
    ) %>% 
  ungroup() %>% 
  select(-aux1,-aux2)

# A tibble: 9 x 2
  survey var_code
  <chr>  <chr>   
1 D      D1      
2 A      A1      
3 B      B3      
4 B      B3      
5 B      B3      
6 E      E1      
7 B      B4      
8 E      E2      
9 B      B5

CodePudding user response：

This solution with rowid. Added an extra element to the sample so it can be tested against multiple missings

library(data.table)
#> Warning: package 'data.table' was built under R version 4.2.2
dat <- fread("Survey Variable_codes_2022
              D       D1
              A       A1
              B       B1
              B       B3
              B       B2
              E       E1
              B       NA
              E       NA
              E       NA")

dat[, n := as.numeric(substr(
  Variable_codes_2022, nchar(Survey) 1, nchar(Variable_codes_2022)))]

dat[is.na(n),
  Variable_codes_2022 := paste0(Survey, rowid(Survey)   
    dat[.SD[,.(Survey)], .(m=max(n, na.rm=T)), on = "Survey", by=.EACHI ][,m])]

dat                                                                      
#>    Survey Variable_codes_2022  n
#> 1:      D                  D1  1
#> 2:      A                  A1  1
#> 3:      B                  B1  1
#> 4:      B                  B3  3
#> 5:      B                  B2  2
#> 6:      E                  E1  1
#> 7:      B                  B4 NA
#> 8:      E                  E2 NA
#> 9:      E                  E3 NA

CodePudding user response：

data.table option using rleid like this:

library(data.table)
dat[, Variable_codes_2022 := paste0(Survey, rleid(Variable_codes_2022)), by = Survey]
dat
#>    Survey Variable_codes_2022
#> 1:      D                  D1
#> 2:      A                  A1
#> 3:      B                  B1
#> 4:      B                  B2
#> 5:      B                  B3
#> 6:      E                  E1
#> 7:      B                  B4
#> 8:      E                  E2

^{Created on 2022-12-01 with reprex v2.0.2}