Add a column with an ID for matching rows-CodePudding

I have data as follows:

library(data.table)
dat <- fread("Variable_codes_2022 Variables_2022
              Cat1_1              This_question
              Cat1_2              Other_question
              Cat2_1              One_question
              Cat2_2              Another_question
              Cat3_1              Some_question
              Cat3_2              Extra_question
              Cat3_3              This_question
              Cat4_1              One_question
              Cat4_2              Wrong_question")

What I would like to do, is to create a new column, that provides a unique new variable code, for matching variables. I started with creating a column that shows the duplicates, but this only gives TRUE for the second occurrence and not both. In addition, I then still have to give the TRUE values unique names.

dat$Common_codes_2022 <-  duplicated(dat[,2])

How should I do this?

Desired output:

   Variable_codes_2022   Variables_2022 Common_codes_2022
1:              Cat1_1    This_question Com_1
2:              Cat1_2   Other_question
3:              Cat2_1     One_question Com_2
4:              Cat2_2 Another_question
5:              Cat3_1    Some_question
6:              Cat3_2   Extra_question
7:              Cat3_3    This_question Com_1
8:              Cat4_1     One_question Com_2
9:              Cat4_2   Wrong_question

CodePudding user response：

You can find the duplicated, and make a factor out of it, with the dupes as levels= and customized labels=.

(lv <- with(dat, Variables_2022[duplicated(Variables_2022)]))
# [1] "This_question" "One_question" 

names(lv) <- paste0('Com_', seq_along(lv))

In data.table:

dat[,Common_codes_2022 := factor(Variables_2022, levels=lv, labels=names(lv))]
dat
#    Variable_codes_2022   Variables_2022 Common_codes_2022
# 1:              Cat1_1    This_question             Com_1
# 2:              Cat1_2   Other_question              <NA>
# 3:              Cat2_1     One_question             Com_2
# 4:              Cat2_2 Another_question              <NA>
# 5:              Cat3_1    Some_question              <NA>
# 6:              Cat3_2   Extra_question              <NA>
# 7:              Cat3_3    This_question             Com_1
# 8:              Cat4_1     One_question             Com_2
# 9:              Cat4_2   Wrong_question              <NA>

or base R:

transform(dat, Common_codes_2022=factor(Variables_2022, levels=lv, labels=names(lv)))
#    Variable_codes_2022   Variables_2022 Common_codes_2022
# 1:              Cat1_1    This_question             Com_1
# 2:              Cat1_2   Other_question              <NA>
# 3:              Cat2_1     One_question             Com_2
# 4:              Cat2_2 Another_question              <NA>
# 5:              Cat3_1    Some_question              <NA>
# 6:              Cat3_2   Extra_question              <NA>
# 7:              Cat3_3    This_question             Com_1
# 8:              Cat4_1     One_question             Com_2
# 9:              Cat4_2   Wrong_question              <NA>

CodePudding user response：

If the common codes variable is just labeling specific Variables_2022 responses, you can use conditionals such as this case_when() example.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last

dat <- fread("Variable_codes_2022 Variables_2022
              Cat1_1              This_question
              Cat1_2              Other_question
              Cat2_1              One_question
              Cat2_2              Another_question
              Cat3_1              Some_question
              Cat3_2              Extra_question
              Cat3_3              This_question
              Cat4_1              One_question
              Cat4_2              Wrong_question")

dat %>%
  mutate(Common_codes_2022 = case_when(
    Variables_2022 == "This_question" ~ "Com_1",
    Variables_2022 == "One_question" ~ "Com_2",
    TRUE ~ "")
  )
#>    Variable_codes_2022   Variables_2022 Common_codes_2022
#> 1:              Cat1_1    This_question             Com_1
#> 2:              Cat1_2   Other_question                  
#> 3:              Cat2_1     One_question             Com_2
#> 4:              Cat2_2 Another_question                  
#> 5:              Cat3_1    Some_question                  
#> 6:              Cat3_2   Extra_question                  
#> 7:              Cat3_3    This_question             Com_1
#> 8:              Cat4_1     One_question             Com_2
#> 9:              Cat4_2   Wrong_question

^{Created on 2022-11-12 with reprex v2.0.2}

CodePudding user response：

Using dplyr:

library(data.table)
library(dplyr)

dat <- fread("Variable_codes_2022 Variables_2022
              Cat1_1              This_question
              Cat1_2              Other_question
              Cat2_1              One_question
              Cat2_2              Another_question
              Cat3_1              Some_question
              Cat3_2              Extra_question
              Cat3_3              This_question
              Cat4_1              One_question
              Cat4_2              Wrong_question")

dat_split <- split(
  dat,
  duplicated(dat$Variables_2022) |
    duplicated(dat$Variables_2022, fromLast = TRUE)
)

dat_split$`TRUE` %>%
  group_by(Variables_2022) %>%
  mutate(Common_codes_2022 = paste0("Com_", cur_group_id())) %>%
  bind_rows(dat_split$`FALSE`) %>%
  arrange(Variable_codes_2022)

But if you don't care for the names of Common_codes_2022 as long as they are unique you can use:

library(data.table)
library(dplyr)

dat <- fread("Variable_codes_2022 Variables_2022
              Cat1_1              This_question
              Cat1_2              Other_question
              Cat2_1              One_question
              Cat2_2              Another_question
              Cat3_1              Some_question
              Cat3_2              Extra_question
              Cat3_3              This_question
              Cat4_1              One_question
              Cat4_2              Wrong_question")

dat %>%
  group_by(Variables_2022) %>%
  mutate(
    Common_codes_2022 = if_else(
      n() > 1,
      paste0("Com_", cur_group_id()),
      NA_character_
    )
  )

CodePudding user response：

Update version:

library(dplyr)

dat %>% 
  mutate(id = row_number()) %>% 
  mutate(Common_codes_2022 = duplicated(Variables_2022) | duplicated(Variables_2022, fromLast=TRUE)) %>% 
  arrange(-Common_codes_2022) %>% 
  mutate(Common_codes_2022 = ifelse(Common_codes_2022==TRUE,
                                    paste("Com", ifelse(Common_codes_2022==TRUE, c(1,2), Common_codes_2022), sep = "_"), NA)) %>% 
  arrange(id) %>% 
  select(-id)

   Variable_codes_2022   Variables_2022 Common_codes_2022
1:              Cat1_1    This_question             Com_1
2:              Cat1_2   Other_question              <NA>
3:              Cat2_1     One_question             Com_2
4:              Cat2_2 Another_question              <NA>
5:              Cat3_1    Some_question              <NA>
6:              Cat3_2   Extra_question              <NA>
7:              Cat3_3    This_question             Com_1
8:              Cat4_1     One_question             Com_2
9:              Cat4_2   Wrong_question              <NA>