Logic operations with string/character data in R-CodePudding

I want to make scoring at an experiment that I ran in R. In this experiment, the subjects are asked multiple choice questions, to which there was one correct answer. And their responses are saved as verbal data in .csv. Here is the head of the data. The first row represents the correct answer:

    data <- structure(list(PRE_TR.1 = c(1L, 1L, 1L), PRE_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}", 
                                                              "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), PRE_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}", 
                "{\"WQ3\":\"<strong>D.</strong> There is no basis for predicting which post office would have the greater number of days on which mean heights were over 71 inches.\"}"
), PRE_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}", 
                "{\"WQ4\":\"<strong>B.</strong> The large street\"}"), PRE_RULE_LLN = c("A", 
                                                                                        "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}", 
                                                                                        "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
                ), PRE_IS.1 = c(0L, 2L, 2L), PRE_IS.2 = c(3L, 3L, 3L), PRE_IS.3 = c(0L, 
                                                                                    0L, 3L), PRE_IS.4 = c(3L, 3L, 3L), PRE_IS.5 = c(2L, 2L, 1L), 
PRE_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}", 
             "{\"WQ1\":\"<strong>D.</strong> ItÃƒÂ¢Ã¢â\200šÂ¬Ã¢â\200žÂ¢s impossible to predict the value of the standard deviation.\"}"
), PRE_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of  sample means have a smaller and smaller standard deviation\"}", 
                    "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of  sample means have a smaller and smaller standard deviation\"}"
), PT.1 = c(NA, TRUE, TRUE), PT.2 = c(NA, TRUE, TRUE), PT.3 = c(NA, 
                                                                TRUE, TRUE), PT.4 = c(NA, TRUE, TRUE), PT.5 = c(NA, TRUE, 
                                                                                                                TRUE), PT.6 = c(NA, TRUE, TRUE), PT.7 = c(NA, TRUE, TRUE), 
PT.8 = c(NA, TRUE, TRUE), PT.9 = c(NA, TRUE, TRUE), PT.10 = c(NA, 
                                                              TRUE, FALSE), POST_IS.2 = c(3L, 3L, 0L), POST_IS.3 = c(0L, 
                                                                                                                     0L, 3L), POST_IS.4 = c(3L, 3L, 0L), POST_IS.5 = c(2L, 2L, 
                                                                                                                                                                       0L), POST_IS.1 = c(0L, 0L, 0L), POST_TR.1 = c(1L, 1L, 1L), 
POST_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}", 
              "{\"WQ1\":\"<strong>D.</strong> ItÃƒÂ¢Ã¢â\200šÂ¬Ã¢â\200žÂ¢s impossible to predict the value of the standard deviation.\"}"
), POST_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}", 
                 "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), POST_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}", 
                 "{\"WQ3\":\"<strong>D.</strong> There is no basis for predicting which post office would have the greater number of days on which mean heights were over 71 inches.\"}"
), POST_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}", 
                 "{\"WQ4\":\"<strong>B.</strong> The large street\"}"), POST_RULE_CLT = c("A", 
                                                                                          "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of  sample means have a smaller and smaller standard deviation\"}", 
                                                                                          "{\"R1\":\"<strong>D.</strong> As the sample size increases, the distribution of sample means have a similar standard deviation to that of the population.\"}"
                 ), POST_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}", 
                                      "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
                 ), Exp1 = c("", "{\"Q0\":\"You are drawing a sample that is more like the population in size as well as having more subjects accounts for possibly outliers\"}", 
                             "{\"Q0\":\"\\nif the sample mean increases, will inevitably get closer to the population mean because the sample size is becoming closer to the population size.\"}"
                 ), Exp2 = c("", "{\"Q0\":\"As your sample size increases, the means calculated will be around the population and outliers wonÃ¢â‚¬â„¢t throw off the calculation\"}", 
                             "{\"Q0\":\"\"}"), TIME = c(NA, 508432L, 2180078L)), row.names = c(NA, 
                                                                                               3L), class = "data.frame")

I need to score these answer. The scoring system should work like this: For all columns that start with "PRE_TR", each participant (row) should receive a score of "TOTAL PRE_TR" by summing their correct answers in these columns. Then they need to receive a score of "TOTAL PRE_IS" by summing their correct answers in the columns that start with "PRE_IS" and so on... Below I show head of an example output that I want:

output <- structure(list(PRE_TR.1 = c(1L, 1L), PRE_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
), PRE_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}"
), PRE_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}"
), PRE_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
), PRE_IS.1 = c(0L, 2L), PRE_IS.2 = c(3L, 3L), PRE_IS.3 = c(0L, 
                                                            0L), PRE_IS.4 = c(3L, 3L), PRE_IS.5 = c(2L, 2L), PRE_TR.5 = c("C", 
                                                                                                                          "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}"
                                                            ), PRE_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of  sample means have a smaller and smaller standard deviation\"}"
                                                            ), PT.1 = c(NA, TRUE), PT.2 = c(NA, TRUE), PT.3 = c(NA, TRUE), 
PT.4 = c(NA, TRUE), PT.5 = c(NA, TRUE), PT.6 = c(NA, TRUE
), PT.7 = c(NA, TRUE), PT.8 = c(NA, TRUE), PT.9 = c(NA, TRUE
), PT.10 = c(NA, TRUE), POST_IS.2 = c(3L, 3L), POST_IS.3 = c(0L, 
                                                             0L), POST_IS.4 = c(3L, 3L), POST_IS.5 = c(2L, 2L), POST_IS.1 = c(0L, 
                                                                                                                              0L), POST_TR.1 = c(1L, 1L), POST_TR.5 = c("C", "{\"WQ1\":\"<strong>A. </strong> a standard deviation equal to 3 inches\"}"
                                                                                                                              ), POST_TR.2 = c("B", "{\"WQ2\":\"<strong>B.</strong> You should go to the large office.\"}"
                                                                                                                              ), POST_TR.3 = c("A", "{\"WQ3\":\"<strong>A. </strong> The number of days on which mean heights were over 71 inches would be greater for the small post office than for the large post office.\"}"
                                                                                                                              ), POST_TR.4 = c("B", "{\"WQ4\":\"<strong>B.</strong> The large street\"}"
                                                                                                                              ), POST_RULE_CLT = c("A", "{\"R1\":\"<strong>A. </strong> As the sample size increases, the distribution of  sample means have a smaller and smaller standard deviation\"}"
                                                                                                                              ), POST_RULE_LLN = c("A", "{\"R2\":\"<strong>A. </strong> As the sample size increases, its mean will tend to be closer to that of the population\"}"
                                                                                                                              ), Exp1 = c("", "{\"Q0\":\"You are drawing a sample that is more like the population in size as well as having more subjects accounts for possibly outliers\"}"
                                                                                                                              ), Exp2 = c("", "{\"Q0\":\"As your sample size increases, the means calculated will be around the population and outliers wonÃ¢â‚¬â„¢t throw off the calculation\"}"
                                                                                                                              ), TIME = c(NA, 508432L), TOTAL_PRE_TR = c(NA, 4L), TOTAL_PRE_IS = c(NA, 
                                                                                                                                                                                                   4L), TOTAL_PRE_RULE = c(NA, 1L), TOTAL_T = c(NA, 10L), TOTAL_POST_TR = c(NA, 
                                                                                                                                                                                                                                                                            4L), TOTAL_POST_IS = c(NA, 5L), TOTAL_POST_RULE = c(NA, 2L
                                                                                                                                                                                                                                                                            )), row.names = 1:2, class = "data.frame")

How to do this, considering that the verbal answers are very long, and I just want to be able to select one word in the correct option to make it easier? An algoritm such as "If the answer at the column PRE_TR1 includes "B", add one point for the score TOTAL_PRE_TR.

CodePudding user response：

Here's my first pass at it. There's room for improvement here, but maybe it will set you on the right direction. This question seems well suited for dplyr, and I'd be very interested in seeing answer using that package. Here's how you could do this in base:

The basic idea is to use pattern matching with grepl() to detect correct answers. That's straight forward because we can use the apply() family to compare each row of your response dataset to your dataset of correct answers.

The way I see it, the challenging bit is to score responses only within a group of answers. I do that here with more pattern matching to find unique groups, then calculating a sum of scores within each group in a for-loop.

Are these the results you were expecting?

# define dataset of correct answers only
ans <- data[1,]
# define dataset of responses only
data <- data[-1,]

# Get column indices for each group of answers
vars <- unique(gsub(pattern = ".?\\d ", replacement = "", names(data)))
vars
#>  [1] "PRE_TR"        "PRE_RULE_LLN"  "PRE_IS"        "PRE_RULE_CLT" 
#>  [5] "PT"            "POST_IS"       "POST_TR"       "POST_RULE_CLT"
#>  [9] "POST_RULE_LLN" "Ex"            "TIME"
colgroups <- sapply(1:length(vars), FUN = function(x) grep(vars[x], names(data)))

# Detect correct answers
scores <- sapply(1:ncol(data), FUN = function(x) ifelse(grepl(pattern = ans[1,x], x = data[,x]),1,0))
scores
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
#> [1,]    1    1    1    1    1    0    1    1    1     1     0     1    NA    NA
#> [2,]    1    1    0    1    1    0    1    0    1     0     0     1    NA    NA
#>      [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
#> [1,]    NA    NA    NA    NA    NA    NA    NA    NA     1     1     1     1
#> [2,]    NA    NA    NA    NA    NA    NA    NA    NA     0     0     0     0
#>      [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37]
#> [1,]     1     1     0     1     1     1     1     1     1     1    NA
#> [2,]     1     1     0     1     0     1     1     1     1     1    NA

# Make output dataframe
output <- data.frame(matrix(ncol = length(vars), nrow = nrow(data)))
names(output) <- vars
output
#>   PRE_TR PRE_RULE_LLN PRE_IS PRE_RULE_CLT PT POST_IS POST_TR POST_RULE_CLT
#> 1     NA           NA     NA           NA NA      NA      NA            NA
#> 2     NA           NA     NA           NA NA      NA      NA            NA
#>   POST_RULE_LLN Ex TIME
#> 1            NA NA   NA
#> 2            NA NA   NA

# Sum correct answers one group at a time
scores <- data.frame(scores)
for(i in 1:length(colgroups)){
  # We only need to sum if a group has more than one response
  # Multiple respones per group
  if(length(colgroups[[i]]) > 1){
    output[,vars[i]] <- rowSums(scores[,colgroups[[i]]])
  } else {
    # One response per group
    output[,vars[i]] <- scores[,colgroups[[i]]]
  }
}

output
#>   PRE_TR PRE_RULE_LLN PRE_IS PRE_RULE_CLT PT POST_IS POST_TR POST_RULE_CLT
#> 1      4            1      4            1 NA       5       4             1
#> 2      3            1      2            1 NA       1       3             1
#>   POST_RULE_LLN Ex TIME
#> 1             1  2   NA
#> 2             1  2   NA
Created on 2021-10-08 by the reprex package (v2.0.1)

CodePudding user response：

data 
library(tidyverse)

fNoCharacter = function(x) !is.character(x)

dfCorrect = data %>% as_tibble() %>% slice_head() %>% 
  mutate(PRE_TR.1 = PRE_TR.1 %>% paste0()) %>% 
  mutate_if(fNoCharacter, paste0)

dfAnswers = data %>% as_tibble() %>% slice(2:nrow(.)) %>% 
  mutate(id = 1:nrow(.)) %>% 
  mutate_if(fNoCharacter, paste0)

fGetAnswerPRE_TR = function(x) ifelse(is.na(str_match(x, "(<strong>)(.)")[3]), x, 
                                      str_match(x, "(<strong>)(.)")[3])
fGetAnswerPRE_TR = Vectorize(fGetAnswerPRE_TR)

fCorrect = function(val, start_w) val==dfCorrect %>% 
    pivot_longer(starts_with(start_w)) %>% pull(value)
  
dSumCorrect = function(data, AnsName){
  df1<<-data
  data %>% 
  pivot_longer(starts_with(AnsName)) %>% 
  mutate(Correct = fCorrect(fGetAnswerPRE_TR(value), AnsName)) %>% 
  pull(Correct) %>% sum()
}

dfAnswers %>% group_by(id) %>% 
  nest() %>% 
  mutate(`TOTAL PRE_TR` = map(data, ~dSumCorrect(.x,"PRE_TR")),
         `TOTAL PRE_IS` = map(data, ~dSumCorrect(.x,"PRE_IS")),
         `TOTAL PRE_RULE` = map(data, ~dSumCorrect(.x,"PRE_RULE")),
         `TOTAL PT` = map(data, ~dSumCorrect(.x,"PT")),
         `TOTAL POST_IS` = map(data, ~dSumCorrect(.x,"POST_IS")),
         `TOTAL POST_TR` = map(data, ~dSumCorrect(.x,"POST_TR")),
         `TOTAL POST_RULE` = map(data, ~dSumCorrect(.x,"POST_RULE"))) %>%
  unnest(c(`TOTAL PRE_TR`:`TOTAL POST_RULE`))

output

# A tibble: 2 x 9
# Groups:   id [2]
  id    data              `TOTAL PRE_TR` `TOTAL PRE_IS` `TOTAL PRE_RULE` `TOTAL PT` `TOTAL POST_IS` `TOTAL POST_TR` `TOTAL POST_RULE`
  <chr> <list>                     <int>          <int>            <int>      <int>           <int>           <int>             <int>
1 1     <tibble [1 x 37]>              4              4                2          0               5               4                 2
2 2     <tibble [1 x 37]>              3              2                2          0               1               3                 1