Create presence/absence variables from character string for long data-CodePudding

Let's say I have a data frame like this:

dat<- data.frame(ID= rep(c("A","B","C","D"),4),
             test= rep(c("pre","post"),8),
             item= c(rep("item1",8),rep("item2",8))
             answer= c("undergraduateeducation_graduateorprofessionalschool_employment", 
                       "graduateorprofessionalschool",
                       "undergraduateeducation_graduateorprofessionalschool_employment_volunteeractivityoroutreach",
                       "volunteeractivityoroutreach", 
                       "undergraduateeducation_employment_volunteeractivityoroutreach",
                       "employment",
                       "volunteeractivityoroutreach",
                       "undergraduateeducation_graduateorprofessionalschool_employment_volunteeractivityoroutreach",
                       "undergraduateeducation_graduateorprofessionalschool_employment", 
                       "graduateorprofessionalschool",
                       "undergraduateeducation_graduateorprofessionalschool_employment_volunteeractivityoroutreach",
                       "volunteeractivityoroutreach", 
                       "undergraduateeducation_employment_volunteeractivityoroutreach",
                       "employment",
                       "volunteeractivityoroutreach",
                       "undergraduateeducation_graduateorprofessionalschool_employment_volunteeractivityoroutreach"))

The answer column represents a "select all the apply" answer type- where the underscore separates selected answer options. For each ID, test and item, I would like to change this single variable to multiple presence/absence variables indicating the presence or absence of that answer component in the string. 1 indicates that answer option was present in the respondents answer and 0 represents that component was absent. The variables undergraduate, graduate, employment and volunteer in res correspond to the following strings in answer, respectivley: undergraduateeducation, graduateorprofessionalschool,employment, volunteeractivityoroutreach. White spaces were removed.

The result data frame would look as follows:

res<- data.frame(ID= rep(c("A","B","C","D"),4),
             test= rep(c("pre","post"),8),
             item= c(rep("item1",8),rep("item2",8)),
             undergraduate= c(1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1),
             graduate= c(1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1),
             employment=c(1,0,1,0,1,1,0,1,1,0,1,0,1,1,0,1),
             volunteer=c(0,0,1,1,1,0,1,1,0,0,1,1,1,0,1,1))

CodePudding user response：

In base R you could do:

new_cols <- c('undergraduate', 'graduate', 'employment', 'volunteer')

cbind(dat[1:3],
      as.data.frame(do.call(rbind, lapply(strsplit(dat$answer, "_"), 
      function(x) {
        z <- sapply(new_cols, function(y) as.numeric(grepl(paste0("\\b", y), x)))
        if(is.vector(z)) z else colSums(z)
      }))))
#>    ID test  item undergraduate graduate employment volunteer
#> 1   A  pre item1             1        1          1         0
#> 2   B post item1             0        1          0         0
#> 3   C  pre item1             1        1          1         1
#> 4   D post item1             0        0          0         1
#> 5   A  pre item1             1        0          1         1
#> 6   B post item1             0        0          1         0
#> 7   C  pre item1             0        0          0         1
#> 8   D post item1             1        1          1         1
#> 9   A  pre item2             1        1          1         0
#> 10  B post item2             0        1          0         0
#> 11  C  pre item2             1        1          1         1
#> 12  D post item2             0        0          0         1
#> 13  A  pre item2             1        0          1         1
#> 14  B post item2             0        0          1         0
#> 15  C  pre item2             0        0          0         1
#> 16  D post item2             1        1          1         1

^{Created on 2022-05-05 by the reprex package (v2.0.1)}

CodePudding user response：

One option is to use tidyverse to separate the data into rows on the _, then keep only the keywords (which will be used for column names). Then, we create a value column to note presence, then we can pivot to wide format, and fill the other values with 0.

library(tidyverse)

result <- dat %>%
  mutate(rn = row_number()) %>% 
  separate_rows(answer, sep = "_") %>%
  mutate(answer = str_extract(answer, "undergraduate|graduate|employment|volunteer"),
         value = 1) %>% 
  pivot_wider(names_from = "answer", values_from = "value", values_fill = 0) %>% 
  select(-rn)

Output

   ID    test  item  undergraduate graduate employment volunteer
   <chr> <chr> <chr>         <dbl>    <dbl>      <dbl>     <dbl>
 1 A     pre   item1             1        1          1         0
 2 B     post  item1             0        1          0         0
 3 C     pre   item1             1        1          1         1
 4 D     post  item1             0        0          0         1
 5 A     pre   item1             1        0          1         1
 6 B     post  item1             0        0          1         0
 7 C     pre   item1             0        0          0         1
 8 D     post  item1             1        1          1         1
 9 A     pre   item2             1        1          1         0
10 B     post  item2             0        1          0         0
11 C     pre   item2             1        1          1         1
12 D     post  item2             0        0          0         1
13 A     pre   item2             1        0          1         1
14 B     post  item2             0        0          1         0
15 C     pre   item2             0        0          0         1
16 D     post  item2             1        1          1         1

Test

identical(result, as_tibble(res))

#[1] TRUE