R apply function by colnames with increasing integer-CodePudding

I am scoring a survey (23 items) with multiple options for each item (it is select all answers that apply, not one choice per item), and trying to find the minimum, maximum and average value for each item. I have written code to do this (below) but am wondering if there is a more efficient way than to cut and paste the three lines creating min, max and avg columns for every item.

Here is the reproducible example with 2 questions, 3 answer options for questions:

#Establish dataframe
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")

#Rescore incorrect value
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)

This is the code to calculate the values (note here Option A, B and C are the answer choices, while the _1 denotes item 1)

##Calculate Values
dt$it1_min <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, min, na.rm=T)
dt$it1_max <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, max, na.rm=T)
dt$it1_avg <- rowMeans(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], na.rm=T)

I am wondering if I need to do the above ^ for every single item, or if it's possible to write a function so that I can score all items (OptA_1 through OptA_23) more efficiently.

###potentially repetitive code? 
dt$it2_min <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, min, na.rm=T)
dt$it2_max <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, max, na.rm=T)
dt$it2_avg <- rowMeans(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], na.rm=T)

Here is what the eventual scoring will look like:

##Eventual scoring
dt$tot_min <- rowSums(dt[ ,c("it1_min", "it2_min")], na.rm=T)
dt$tot_max <- rowSums(dt[ ,c("it1_max", "it2_max")], na.rm=T)
dt$tot_avg <- rowSums(dt[ ,c("it1_avg", "it2_avg")], na.rm=T)

CodePudding user response：

You will need to convert the data to long form first (tidyr::pivot_longer).

library(dplyr)
library(tidyr)

dt_long <- dt %>% 
    # add identifier for participant
    mutate(participant = row_number()) %>% 

    # convert to long form using pattern 
    pivot_longer(cols = -participant, 
                 names_pattern = "Opt(.*)_(\\d )", 
                 names_to = c("answer_choice", "item"), 
                 values_to = "selected")
dt_long

# long form data looks like this

# A tibble: 30 x 4
# participant answer_choice item  selected
# <int> <chr>         <chr>    <dbl>
#     1           1 A             1           NA
#     2           1 B             1            2
#     3           1 C             1           NA
#     4           1 A             2            1
#     5           1 B             2            2
#     6           1 C             2            3


# now group by each participant and item and compute the required fields
dt_long %>% 
    group_by(item, participant) %>% 
    summarise(it_min = min(selected, na.rm = TRUE), 
              it_max = max(selected, na.rm = TRUE), 
              it_avg = mean(selected, na.rm = TRUE))

#> # A tibble: 10 x 5
#> # Groups:   item [2]
#>    item  participant it_min it_max it_avg
#>    <chr>       <int>  <dbl>  <dbl>  <dbl>
#>  1 1               1      2      2    2  
#>  2 1               2      2      2    2  
#>  3 1               3      1      3    2  
#>  4 1               4      2      3    2.5
#>  5 1               5      3      3    3  
#>  6 2               1      1      1    1  
#>  7 2               2      1      3    2  
#>  8 2               3      1      3    2  
#>  9 2               4      1      3    2  
#> 10 2               5      1      2    1.5

CodePudding user response：

You can use data.table to melt your dt long, estimate your indicators by group, and then dcast back to wide format:

library(data.table)

dt = melt(setDT(dt)[, row:=.I], id.vars="row")[, c("variable","grp") := tstrsplit(variable, "_")][]

dcast(dt[, .(it_min = min(value,na.rm=T),
       it_max = max(value,na.rm=T),
       it_avg = mean(value, na.rm=T)
       ), by=.(row,grp)],
      row~grp,
      value.var=c("it_min", "it_max", "it_avg")
)

Output: (note that you used sample() above, without setting a seed, see my reproducible data below)

     row it_min_1 it_min_2 it_max_1 it_max_2 it_avg_1 it_avg_2
   <int>    <num>    <num>    <num>    <num>    <num>    <num>
1:     1        2       NA        3       NA      2.5      NaN
2:     2        2        1        3        3      2.5        2
3:     3        2        3        3        3      2.5        3
4:     4        1       NA        1       NA      1.0      NaN
5:     5        3        3        3        3      3.0        3

Input Data:

set.seed(123)
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")

dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)