How to Properly Split Survey Responses, in R-CodePudding

In survey statistical data, multiple response labels may be recorded in a single column when multiple responses are allowed to a question.

In analyzing such data, you may want to store multiple responses in separate columns, necessitating string splitting.

Run the following code as a sample.

smp <- data.frame(
  x = c("1,2,3", "2,5,9", "1,5", "2,7,8,9,10")
)
smp
#>            x
#> 1      1,2,3
#> 2      2,5,9
#> 3        1,5
#> 4 2,7,8,9,10

^{Created on 2023-01-21 with reprex v2.0.2}

In this data, each row represents the responses of a different respondent, and the analyst knows how many choices there are in total, but does not know which or how many responses will be selected.

The result of dividing this appropriately should look like this

out <- data.frame(
  d_1 = c(1,NA,1,NA),
  d_2 = c(2,2,NA,2),
  d_3 = c(3,NA,NA,NA),
  d_4 = c(NA,NA,NA,NA),
  d_5 = c(NA,5,5,NA),
  d_6 = c(NA,NA,NA,NA),
  d_7 = c(NA,NA,NA,7),
  d_8 = c(NA,NA,NA,8),
  d_9 = c(NA,9,NA,9),
  d_10 = c(NA,NA,NA,10)
)
out
#>   d_1 d_2 d_3 d_4 d_5 d_6 d_7 d_8 d_9 d_10
#> 1   1   2   3  NA  NA  NA  NA  NA  NA   NA
#> 2  NA   2  NA  NA   5  NA  NA  NA   9   NA
#> 3   1  NA  NA  NA   5  NA  NA  NA  NA   NA
#> 4  NA   2  NA  NA  NA  NA   7   8   9   10

^{Created on 2023-01-21 with reprex v2.0.2}

Is there a good way to arrange the data into this form?

CodePudding user response：

This not elegant function can give the expected result to you.

split_survey <- function(data){
  df <- data
  
  val <- df %>% 
    tidyr::separate_rows(x, sep=",", convert = TRUE) %>% 
    range() 
  
  Names <- paste0("d_", seq(val[1], val[2]))
  df <- data.frame(do.call(rbind, list(Names)))
  names(df) <- df[1,]
  df[1:nrow(data), ] <- NA
  
  values <- lapply(strsplit(data$x, ","), function(x) paste0("d_",sub("\\s ", "", x)))

  for(i in seq_len(nrow(df))){
    ind <- names(df) %in%  values[[i]] 
    df[i, ind] <- as.integer(sub("\\D ", "", values[[i]]))
  }
  
  df[] <- lapply(df, as.integer)

  return(df)
}


split_survey(smp)
 d_1 d_2 d_3 d_4 d_5 d_6 d_7 d_8 d_9 d_10
1   1   2   3  NA  NA  NA  NA  NA  NA   NA
2  NA   2  NA  NA   5  NA  NA  NA   9   NA
3   1  NA  NA  NA   5  NA  NA  NA  NA   NA
4  NA   2  NA  NA  NA  NA   7   8   9   10
5  NA   2  NA  NA  NA  NA   7   8   9   10

It will work if smp only contains one variable named x as in your example.

CodePudding user response：

A truth table perhaps, in base R

smp <- data.frame(
  x = c("1,2,3", "2,5,9", "1,5", "2,7,8,9,10")
)
smp_mtx <- matrix(NA, nrow = 4, ncol = 10)

for (i in 1:nrow(smp)) {
smp_mtx[i, which(c(1,2,3,4,5,6,7,8,9,10) %in% as.numeric(unlist(strsplit(smp[i, ], split = ','))) == TRUE)] <- TRUE
}
smp_mtx
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] TRUE TRUE TRUE   NA   NA   NA   NA   NA   NA    NA
[2,]   NA TRUE   NA   NA TRUE   NA   NA   NA TRUE    NA
[3,] TRUE   NA   NA   NA TRUE   NA   NA   NA   NA    NA
[4,]   NA TRUE   NA   NA   NA   NA TRUE TRUE TRUE  TRUE

which( offers a curious, and somewhat mysterious arr.ind = TRUE) that might allow us to walk our values back into the truth table... I've found it easiest to take this a an object first

arr_idx_T <- which(smp_mtx == TRUE, arr.ind = TRUE)
arr_idx_T
      row col
 [1,]   1   1
 [2,]   3   1
 [3,]   1   2
 [4,]   2   2
 [5,]   4   2
 [6,]   1   3
 [7,]   2   5
 [8,]   3   5
 [9,]   4   7
[10,]   4   8
[11,]   2   9
[12,]   4   9
[13,]   4  10
arr_idx_T[, 2][which(arr_idx_T[,1] == 1)]
[1] 1 2 3
arr_idx_T[, 2][which(arr_idx_T[,1] == 2)]
[1] 2 5 9

giving both columns and values, now just the mechanics of plugging them back in, whose notation is causing my brain to presently slip a cog.

CodePudding user response：

Here is tidyverse solution using a potpourri of functions from the tidyverse family:

library(dplyr)
library(readr)
library(tidyr)

smp %>% 
  mutate(id = row_number()) %>% 
  separate_rows(x) %>% 
  type.convert(as.is = TRUE) %>% 
  arrange(x) %>% 
  complete(x = first(x):last(x)) %>% 
  mutate(x = paste0("d_", x)) %>% 
  count(id, x) %>% 
  pivot_wider(names_from = x, values_from = n) %>%
  filter(row_number() <= n()-1) %>% 
  mutate(across(-id, ~case_when(. == 1 ~ readr::parse_number(cur_column())))) %>% 
  select(order(readr::parse_number(names(.))), -id)

    d_1   d_2   d_3   d_4   d_5   d_6   d_7   d_8   d_9  d_10
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     2     3    NA    NA    NA    NA    NA    NA    NA
2    NA     2    NA    NA     5    NA    NA    NA     9    NA
3     1    NA    NA    NA     5    NA    NA    NA    NA    NA
4    NA     2    NA    NA    NA    NA     7     8     9    10

CodePudding user response：

Here is an alternate approach, where I used the join

data

smp <- data.frame(
  x = c("1,2,3", "2,5,9", "1,5", "2,7,8,9,10")
)

code

library(tidyverse)

dummmy <- data.frame(x=1:10) %>% mutate(x=as.character(x))


df <- tibble(x=strsplit(smp$x,',')) %>% mutate(len=row_number(), value=x) %>% 
  unnest(c(x,value)) %>% 
  full_join(dummmy, by='x') %>% 
  mutate(name=paste0('d_',x), x=as.numeric(x), value=as.numeric(value)) %>% 
  arrange(x) %>%   
  pivot_wider(len, names_from = name, values_from = value) %>% select(-len) %>% 
  mutate(sum=rowSums(across(starts_with('d')),na.rm=T)) %>% 
  filter(sum>0) %>% select(-sum)

^{Created on 2023-01-21 with reprex v2.0.2}

output

# A tibble: 4 × 10
    d_1   d_2   d_3   d_4   d_5   d_6   d_7   d_8   d_9  d_10
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     2     3    NA    NA    NA    NA    NA    NA    NA
2     1    NA    NA    NA     5    NA    NA    NA    NA    NA
3    NA     2    NA    NA     5    NA    NA    NA     9    NA
4    NA     2    NA    NA    NA    NA     7     8     9    10