Summarize quantity of unordered matches and specify unique strings-CodePudding

I have a data frame like the one below:

dat<-data.frame (col1= c(rep("A", 10), rep("B", 10)),
                 test= c(rep("pre", 5), rep("post", 5), rep("pre", 5), rep("post", 5)), 
                 ID= c("ID_A","ID_B","ID_C","ID_D","ID_E","ID_A","ID_B","ID_C", "ID_D","ID_E","ID_F","ID_G","ID_H","ID_I","ID_J","ID_F","ID_G","ID_H","ID_I","ID_J"),
                 answer= c("1_2_3", "4_6_8", "9_4_1", "3_7_2", "5_7_4", "1_2_3", "6_8_4", "9_4_2", "3_9_1", "5_4_7", "6_5_4", "4_6_8", "3_7_1", "6_7_9", "5_7_4", "8_9_1", "6_8_4", "7_1_4", "6_8_3", "5_4_7"))

Each level of ID has two levels in test - which represent responses to a pre and post test. answer represents the respondents answer.

I wish to summarize the following result data frame res:

res<-data.frame(ID= c("ID_A","ID_B","ID_C", "ID_D","ID_E", "ID_F","ID_G","ID_H","ID_I","ID_J"),
               res1=c("yes", "yes","no", "no", "yes", "no", "yes","no", "no", "yes"), #match? yes or no
               res2=c(0,0,1, 2, 0, 3, 0,1, 2, 0 ),#count of the new answer components
               res3= c(NA, NA, 2, "9_1", NA,"8_9_1",NA, 4, "8_3", NA)) #string specifying new answer components

res summarizes matches in answer for each level of ID irrespective of order and returns:

col1 a binary variable that summarizes the presence or absence of a match between the pre and post test
col2 a count of the number of new entries in the post test. Values can only be between 0-3 since there is a limit of 3 values for each answer.
col3 a string that specifies the new answer components present in the post test but not in the pre test.

CodePudding user response：

We can use strsplit to split the answer string column and match the pre with post responses

library(dplyr)

dat |> group_by(ID) |> select(-col1) |> 
       mutate(res1 = case_when(all(strsplit(nth(answer , 1) , "_")[[1]] %in% strsplit(nth(answer , 2) , "_")[[1]]) ~ "yes" , TRUE ~ "no")) |>
       group_by(ID , res1) |> summarise(res2 = 3 - sum(strsplit(nth(answer , 1) , "_")[[1]] %in% strsplit(nth(answer , 2) , "_")[[1]]) ,
       res3 = paste0(setdiff(strsplit(nth(answer , 2) , "_")[[1]] , strsplit(nth(answer , 1) , "_")[[1]]) , collapse = "_"))

Output

# A tibble: 10 × 4
# Groups:   ID [10]
   ID    res1   res2 res3   
   <chr> <chr> <dbl> <chr>  
 1 ID_A  yes       0 ""     
 2 ID_B  yes       0 ""     
 3 ID_C  no        1 "2"    
 4 ID_D  no        2 "9_1"  
 5 ID_E  yes       0 ""     
 6 ID_F  no        3 "8_9_1"
 7 ID_G  yes       0 ""     
 8 ID_H  no        1 "4"    
 9 ID_I  no        2 "8_3"  
10 ID_J  yes       0 ""

CodePudding user response：

You can create a small helper function f

f <- function(ans) {
  p =  strsplit(ans, "_")
  pdiff = setdiff(p[[2]], p[[1]])
  if(length(pdiff)==0) return(list("res1"="yes", "res2"=0L, "res3"=NA_character_))
  return(list("res1"="no", "res2"=length(pdiff), "res3" = paste0(pdiff, collapse="_")))
}

and apply it by ID:

library(tidyverse)
unnest_wider(at %>% group_by(ID) %>% summarize(res=list(f(answer))),res)

Output:

   ID    res1   res2 res3 
   <chr> <chr> <int> <chr>
 1 ID_A  yes       0 NA   
 2 ID_B  yes       0 NA   
 3 ID_C  no        1 2    
 4 ID_D  no        2 9_1  
 5 ID_E  yes       0 NA   
 6 ID_F  no        3 8_9_1
 7 ID_G  yes       0 NA   
 8 ID_H  no        1 4    
 9 ID_I  no        2 8_3  
10 ID_J  yes       0 NA