Generate a unique alphanumeric string based on a previous value using R-CodePudding

I have a column which is partially populated with unique alphanumeric IDs that all start with 'a' and end with 'b', but numbers in between the two are unique and the total length of the string has to be 6 e.g. a4338b, a4339b (...)

I'm looking for a function in R whereby I can fill in the blank values in ascending order based on the last (and highest) identifier.

Could someone provide some advice on how to do so?

Many thanks.

Sample dataframe:

df <- data.frame(
  id = c("a4338b", "a4339b", "a4340b", NA, NA),
  post = c("teacher", "postal worker", "chef", "waitress", "presenter"))

Expected dataframe:

df <- data.frame(
  id = c("a4338b", "a4339b", "a4340b", "a4341b", "a4342b"),
  post = c("teacher", "postal worker", "chef", "waitress", "presenter"))

CodePudding user response：

If you don't want to change the order of your data.frame:

DF <- data.frame(
  id = c("a4338b", "a4339b", "a4340b", NA, NA),
  post = c("teacher", "postal worker", "chef", "waitress", "presenter"))

minID <- min(as.integer(gsub(pattern = "[a-zA-Z]", "", DF$id)), na.rm = TRUE)
DF$new_id <- paste0("a", minID   seq(to = nrow(DF)-grep(minID, DF$id), length.out = nrow(DF)), "b")

> DF
      id          post new_id
1 a4338b       teacher a4338b
2 a4339b postal worker a4339b
3 a4340b          chef a4340b
4   <NA>      waitress a4341b
5   <NA>     presenter a4342b

This also works with e.g. the following:

DF <- data.frame(
  id = c(NA, "a4338b", "a4339b", "a4340b", NA, NA),
  post = c("roofer", "teacher", "postal worker", "chef", "waitress", "presenter"))

minID <- min(as.integer(gsub(pattern = "[a-zA-Z]", "", DF$id)), na.rm = TRUE)
DF$new_id <- paste0("a", minID   seq(to = nrow(DF)-grep(minID, DF$id), length.out = nrow(DF)), "b")

DF

> DF
      id          post new_id
1   <NA>        roofer a4337b
2 a4338b       teacher a4338b
3 a4339b postal worker a4339b
4 a4340b          chef a4340b
5   <NA>      waitress a4341b
6   <NA>     presenter a4342b

CodePudding user response：

Here's a dplyrsolution:

df %>%
   mutate(
     # extract numbers and convert to numeric:
     id_0 = as.numeric(sub(".(\\d ).", "\\1", id)),
     # replace NA with sequential number:
     id_0 = seq(from = first(id_0), to = first(id_0)   length(id_0) -1),
     # wrap numbers in a-b sandwich:
     id = paste0("a", id_0, "b")) %>%
  # deslect helper column:
  select(-id_0)
      id          post
1 a4338b       teacher
2 a4339b postal worker
3 a4340b          chef
4 a4341b      waitress
5 a4342b     presenter

CodePudding user response：

Convert to integer, then add 1:(number of NAs) to max ID number.

#order so that NAs come last
df <- df[ order(is.na(df$id)), ]

#convert to number
df$x <- as.integer(gsub("[ab]", "", df$id))

#if na then add  1
df$id[ is.na(df$id) ] <- paste0("a", max(df$x, na.rm = TRUE)   seq(sum(is.na(df$id))), "b")

#remove helper column
df$x <- NULL

df
#       id          post
# 1 a4338b       teacher
# 2 a4339b postal worker
# 3 a4340b          chef
# 4 a4341b      waitress
# 5 a4342b     presenter