I have data as follows:
library(data.table)
dat <- fread("Variable_codes_2022 Variables_2022
Cat1_1 This_question
Cat1_2 Other_question
Cat2_1 One_question
Cat2_2 Another_question
Cat3_1 Some_question
Cat3_2 Extra_question
Cat3_3 This_question
Cat4_1 One_question
Cat4_2 Wrong_question")
What I would like to do, is to create a new column, that provides a unique new variable code, for matching variables. I started with creating a column that shows the duplicates, but this only gives TRUE
for the second occurrence and not both. In addition, I then still have to give the TRUE
values unique names.
dat$Common_codes_2022 <- duplicated(dat[,2])
How should I do this?
Desired output:
Variable_codes_2022 Variables_2022 Common_codes_2022
1: Cat1_1 This_question Com_1
2: Cat1_2 Other_question
3: Cat2_1 One_question Com_2
4: Cat2_2 Another_question
5: Cat3_1 Some_question
6: Cat3_2 Extra_question
7: Cat3_3 This_question Com_1
8: Cat4_1 One_question Com_2
9: Cat4_2 Wrong_question
CodePudding user response:
You can find the duplicated
, and make a factor
out of it, with the dupes as levels=
and customized labels=
.
(lv <- with(dat, Variables_2022[duplicated(Variables_2022)]))
# [1] "This_question" "One_question"
names(lv) <- paste0('Com_', seq_along(lv))
In data.table
:
dat[,Common_codes_2022 := factor(Variables_2022, levels=lv, labels=names(lv))]
dat
# Variable_codes_2022 Variables_2022 Common_codes_2022
# 1: Cat1_1 This_question Com_1
# 2: Cat1_2 Other_question <NA>
# 3: Cat2_1 One_question Com_2
# 4: Cat2_2 Another_question <NA>
# 5: Cat3_1 Some_question <NA>
# 6: Cat3_2 Extra_question <NA>
# 7: Cat3_3 This_question Com_1
# 8: Cat4_1 One_question Com_2
# 9: Cat4_2 Wrong_question <NA>
or base R:
transform(dat, Common_codes_2022=factor(Variables_2022, levels=lv, labels=names(lv)))
# Variable_codes_2022 Variables_2022 Common_codes_2022
# 1: Cat1_1 This_question Com_1
# 2: Cat1_2 Other_question <NA>
# 3: Cat2_1 One_question Com_2
# 4: Cat2_2 Another_question <NA>
# 5: Cat3_1 Some_question <NA>
# 6: Cat3_2 Extra_question <NA>
# 7: Cat3_3 This_question Com_1
# 8: Cat4_1 One_question Com_2
# 9: Cat4_2 Wrong_question <NA>
CodePudding user response:
If the common codes variable is just labeling specific Variables_2022
responses, you can use conditionals such as this case_when()
example.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(data.table)
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
dat <- fread("Variable_codes_2022 Variables_2022
Cat1_1 This_question
Cat1_2 Other_question
Cat2_1 One_question
Cat2_2 Another_question
Cat3_1 Some_question
Cat3_2 Extra_question
Cat3_3 This_question
Cat4_1 One_question
Cat4_2 Wrong_question")
dat %>%
mutate(Common_codes_2022 = case_when(
Variables_2022 == "This_question" ~ "Com_1",
Variables_2022 == "One_question" ~ "Com_2",
TRUE ~ "")
)
#> Variable_codes_2022 Variables_2022 Common_codes_2022
#> 1: Cat1_1 This_question Com_1
#> 2: Cat1_2 Other_question
#> 3: Cat2_1 One_question Com_2
#> 4: Cat2_2 Another_question
#> 5: Cat3_1 Some_question
#> 6: Cat3_2 Extra_question
#> 7: Cat3_3 This_question Com_1
#> 8: Cat4_1 One_question Com_2
#> 9: Cat4_2 Wrong_question
Created on 2022-11-12 with reprex v2.0.2
CodePudding user response:
Using dplyr
:
library(data.table)
library(dplyr)
dat <- fread("Variable_codes_2022 Variables_2022
Cat1_1 This_question
Cat1_2 Other_question
Cat2_1 One_question
Cat2_2 Another_question
Cat3_1 Some_question
Cat3_2 Extra_question
Cat3_3 This_question
Cat4_1 One_question
Cat4_2 Wrong_question")
dat_split <- split(
dat,
duplicated(dat$Variables_2022) |
duplicated(dat$Variables_2022, fromLast = TRUE)
)
dat_split$`TRUE` %>%
group_by(Variables_2022) %>%
mutate(Common_codes_2022 = paste0("Com_", cur_group_id())) %>%
bind_rows(dat_split$`FALSE`) %>%
arrange(Variable_codes_2022)
But if you don't care for the names of Common_codes_2022
as long as they are unique you can use:
library(data.table)
library(dplyr)
dat <- fread("Variable_codes_2022 Variables_2022
Cat1_1 This_question
Cat1_2 Other_question
Cat2_1 One_question
Cat2_2 Another_question
Cat3_1 Some_question
Cat3_2 Extra_question
Cat3_3 This_question
Cat4_1 One_question
Cat4_2 Wrong_question")
dat %>%
group_by(Variables_2022) %>%
mutate(
Common_codes_2022 = if_else(
n() > 1,
paste0("Com_", cur_group_id()),
NA_character_
)
)
CodePudding user response:
Update version:
library(dplyr)
dat %>%
mutate(id = row_number()) %>%
mutate(Common_codes_2022 = duplicated(Variables_2022) | duplicated(Variables_2022, fromLast=TRUE)) %>%
arrange(-Common_codes_2022) %>%
mutate(Common_codes_2022 = ifelse(Common_codes_2022==TRUE,
paste("Com", ifelse(Common_codes_2022==TRUE, c(1,2), Common_codes_2022), sep = "_"), NA)) %>%
arrange(id) %>%
select(-id)
Variable_codes_2022 Variables_2022 Common_codes_2022
1: Cat1_1 This_question Com_1
2: Cat1_2 Other_question <NA>
3: Cat2_1 One_question Com_2
4: Cat2_2 Another_question <NA>
5: Cat3_1 Some_question <NA>
6: Cat3_2 Extra_question <NA>
7: Cat3_3 This_question Com_1
8: Cat4_1 One_question Com_2
9: Cat4_2 Wrong_question <NA>