I have a data frame like the one below:
dat<-data.frame (col1= c(rep("A", 10), rep("B", 10)),
test= c(rep("pre", 5), rep("post", 5), rep("pre", 5), rep("post", 5)),
ID= c("ID_A","ID_B","ID_C","ID_D","ID_E","ID_A","ID_B","ID_C", "ID_D","ID_E","ID_F","ID_G","ID_H","ID_I","ID_J","ID_F","ID_G","ID_H","ID_I","ID_J"),
answer= c("1_2_3", "4_6_8", "9_4_1", "3_7_2", "5_7_4", "1_2_3", "6_8_4", "9_4_2", "3_9_1", "5_4_7", "6_5_4", "4_6_8", "3_7_1", "6_7_9", "5_7_4", "8_9_1", "6_8_4", "7_1_4", "6_8_3", "5_4_7"))
Each level of ID
has two levels in test
- which represent responses to a pre and post test. answer
represents the respondents answer.
I wish to summarize the following result data frame res
:
res<-data.frame(ID= c("ID_A","ID_B","ID_C", "ID_D","ID_E", "ID_F","ID_G","ID_H","ID_I","ID_J"),
res1=c("yes", "yes","no", "no", "yes", "no", "yes","no", "no", "yes"), #match? yes or no
res2=c(0,0,1, 2, 0, 3, 0,1, 2, 0 ),#count of the new answer components
res3= c(NA, NA, 2, "9_1", NA,"8_9_1",NA, 4, "8_3", NA)) #string specifying new answer components
res
summarizes matches in answer
for each level of ID
irrespective of order and returns:
col1
a binary variable that summarizes the presence or absence of a match between the pre and post testcol2
a count of the number of new entries in the post test. Values can only be between 0-3 since there is a limit of 3 values for each answer.col3
a string that specifies the new answer components present in the post test but not in the pre test.
CodePudding user response:
- We can use
strsplit
to split theanswer
string column and match thepre
withpost
responses
library(dplyr)
dat |> group_by(ID) |> select(-col1) |>
mutate(res1 = case_when(all(strsplit(nth(answer , 1) , "_")[[1]] %in% strsplit(nth(answer , 2) , "_")[[1]]) ~ "yes" , TRUE ~ "no")) |>
group_by(ID , res1) |> summarise(res2 = 3 - sum(strsplit(nth(answer , 1) , "_")[[1]] %in% strsplit(nth(answer , 2) , "_")[[1]]) ,
res3 = paste0(setdiff(strsplit(nth(answer , 2) , "_")[[1]] , strsplit(nth(answer , 1) , "_")[[1]]) , collapse = "_"))
- Output
# A tibble: 10 × 4
# Groups: ID [10]
ID res1 res2 res3
<chr> <chr> <dbl> <chr>
1 ID_A yes 0 ""
2 ID_B yes 0 ""
3 ID_C no 1 "2"
4 ID_D no 2 "9_1"
5 ID_E yes 0 ""
6 ID_F no 3 "8_9_1"
7 ID_G yes 0 ""
8 ID_H no 1 "4"
9 ID_I no 2 "8_3"
10 ID_J yes 0 ""
CodePudding user response:
You can create a small helper function f
f <- function(ans) {
p = strsplit(ans, "_")
pdiff = setdiff(p[[2]], p[[1]])
if(length(pdiff)==0) return(list("res1"="yes", "res2"=0L, "res3"=NA_character_))
return(list("res1"="no", "res2"=length(pdiff), "res3" = paste0(pdiff, collapse="_")))
}
and apply it by ID:
library(tidyverse)
unnest_wider(at %>% group_by(ID) %>% summarize(res=list(f(answer))),res)
Output:
ID res1 res2 res3
<chr> <chr> <int> <chr>
1 ID_A yes 0 NA
2 ID_B yes 0 NA
3 ID_C no 1 2
4 ID_D no 2 9_1
5 ID_E yes 0 NA
6 ID_F no 3 8_9_1
7 ID_G yes 0 NA
8 ID_H no 1 4
9 ID_I no 2 8_3
10 ID_J yes 0 NA