In survey statistical data, multiple response labels may be recorded in a single column when multiple responses are allowed to a question.
In analyzing such data, you may want to store multiple responses in separate columns, necessitating string splitting.
Run the following code as a sample.
smp <- data.frame(
x = c("1,2,3", "2,5,9", "1,5", "2,7,8,9,10")
)
smp
#> x
#> 1 1,2,3
#> 2 2,5,9
#> 3 1,5
#> 4 2,7,8,9,10
Created on 2023-01-21 with reprex v2.0.2
In this data, each row represents the responses of a different respondent, and the analyst knows how many choices there are in total, but does not know which or how many responses will be selected.
The result of dividing this appropriately should look like this
out <- data.frame(
d_1 = c(1,NA,1,NA),
d_2 = c(2,2,NA,2),
d_3 = c(3,NA,NA,NA),
d_4 = c(NA,NA,NA,NA),
d_5 = c(NA,5,5,NA),
d_6 = c(NA,NA,NA,NA),
d_7 = c(NA,NA,NA,7),
d_8 = c(NA,NA,NA,8),
d_9 = c(NA,9,NA,9),
d_10 = c(NA,NA,NA,10)
)
out
#> d_1 d_2 d_3 d_4 d_5 d_6 d_7 d_8 d_9 d_10
#> 1 1 2 3 NA NA NA NA NA NA NA
#> 2 NA 2 NA NA 5 NA NA NA 9 NA
#> 3 1 NA NA NA 5 NA NA NA NA NA
#> 4 NA 2 NA NA NA NA 7 8 9 10
Created on 2023-01-21 with reprex v2.0.2
Is there a good way to arrange the data into this form?
CodePudding user response:
This not elegant function can give the expected result to you.
split_survey <- function(data){
df <- data
val <- df %>%
tidyr::separate_rows(x, sep=",", convert = TRUE) %>%
range()
Names <- paste0("d_", seq(val[1], val[2]))
df <- data.frame(do.call(rbind, list(Names)))
names(df) <- df[1,]
df[1:nrow(data), ] <- NA
values <- lapply(strsplit(data$x, ","), function(x) paste0("d_",sub("\\s ", "", x)))
for(i in seq_len(nrow(df))){
ind <- names(df) %in% values[[i]]
df[i, ind] <- as.integer(sub("\\D ", "", values[[i]]))
}
df[] <- lapply(df, as.integer)
return(df)
}
split_survey(smp)
d_1 d_2 d_3 d_4 d_5 d_6 d_7 d_8 d_9 d_10
1 1 2 3 NA NA NA NA NA NA NA
2 NA 2 NA NA 5 NA NA NA 9 NA
3 1 NA NA NA 5 NA NA NA NA NA
4 NA 2 NA NA NA NA 7 8 9 10
5 NA 2 NA NA NA NA 7 8 9 10
It will work if smp
only contains one variable named x
as in your example.
CodePudding user response:
A truth table perhaps, in base R
smp <- data.frame(
x = c("1,2,3", "2,5,9", "1,5", "2,7,8,9,10")
)
smp_mtx <- matrix(NA, nrow = 4, ncol = 10)
for (i in 1:nrow(smp)) {
smp_mtx[i, which(c(1,2,3,4,5,6,7,8,9,10) %in% as.numeric(unlist(strsplit(smp[i, ], split = ','))) == TRUE)] <- TRUE
}
smp_mtx
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] TRUE TRUE TRUE NA NA NA NA NA NA NA
[2,] NA TRUE NA NA TRUE NA NA NA TRUE NA
[3,] TRUE NA NA NA TRUE NA NA NA NA NA
[4,] NA TRUE NA NA NA NA TRUE TRUE TRUE TRUE
which(
offers a curious, and somewhat mysterious arr.ind = TRUE)
that might allow us to walk our values back into the truth table...
I've found it easiest to take this a an object first
arr_idx_T <- which(smp_mtx == TRUE, arr.ind = TRUE)
arr_idx_T
row col
[1,] 1 1
[2,] 3 1
[3,] 1 2
[4,] 2 2
[5,] 4 2
[6,] 1 3
[7,] 2 5
[8,] 3 5
[9,] 4 7
[10,] 4 8
[11,] 2 9
[12,] 4 9
[13,] 4 10
arr_idx_T[, 2][which(arr_idx_T[,1] == 1)]
[1] 1 2 3
arr_idx_T[, 2][which(arr_idx_T[,1] == 2)]
[1] 2 5 9
giving both columns and values, now just the mechanics of plugging them back in, whose notation is causing my brain to presently slip a cog.
CodePudding user response:
Here is tidyverse
solution using a potpourri of functions from the tidyverse
family:
library(dplyr)
library(readr)
library(tidyr)
smp %>%
mutate(id = row_number()) %>%
separate_rows(x) %>%
type.convert(as.is = TRUE) %>%
arrange(x) %>%
complete(x = first(x):last(x)) %>%
mutate(x = paste0("d_", x)) %>%
count(id, x) %>%
pivot_wider(names_from = x, values_from = n) %>%
filter(row_number() <= n()-1) %>%
mutate(across(-id, ~case_when(. == 1 ~ readr::parse_number(cur_column())))) %>%
select(order(readr::parse_number(names(.))), -id)
d_1 d_2 d_3 d_4 d_5 d_6 d_7 d_8 d_9 d_10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 NA NA NA NA NA NA NA
2 NA 2 NA NA 5 NA NA NA 9 NA
3 1 NA NA NA 5 NA NA NA NA NA
4 NA 2 NA NA NA NA 7 8 9 10
CodePudding user response:
Here is an alternate approach, where I used the join
data
smp <- data.frame(
x = c("1,2,3", "2,5,9", "1,5", "2,7,8,9,10")
)
code
library(tidyverse)
dummmy <- data.frame(x=1:10) %>% mutate(x=as.character(x))
df <- tibble(x=strsplit(smp$x,',')) %>% mutate(len=row_number(), value=x) %>%
unnest(c(x,value)) %>%
full_join(dummmy, by='x') %>%
mutate(name=paste0('d_',x), x=as.numeric(x), value=as.numeric(value)) %>%
arrange(x) %>%
pivot_wider(len, names_from = name, values_from = value) %>% select(-len) %>%
mutate(sum=rowSums(across(starts_with('d')),na.rm=T)) %>%
filter(sum>0) %>% select(-sum)
Created on 2023-01-21 with reprex v2.0.2
output
# A tibble: 4 × 10
d_1 d_2 d_3 d_4 d_5 d_6 d_7 d_8 d_9 d_10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 3 NA NA NA NA NA NA NA
2 1 NA NA NA 5 NA NA NA NA NA
3 NA 2 NA NA 5 NA NA NA 9 NA
4 NA 2 NA NA NA NA 7 8 9 10