Transposing specific columns to the rows in R-CodePudding

I need to transform some column values into rows for specific rows.

Here is a sample dataset:

df <- data.frame(
  student_id = c(1,1,1,2,2,2),
  question_id = c(10,11,12,20,21,22),
  score = c(2,3,4,2,1,5),
  A = c(NA,NA,1, NA,NA,1),
  B = c(NA,NA,2, NA,NA,1),
  C = c(NA,NA,1, NA,NA,3)
)

> df
  student_id question_id score  C  E  O
1          1          10     2 NA NA NA
2          1          11     3 NA NA NA
3          1          12     4  1  2  1
4          2          20     2 NA NA NA
5          2          21     1 NA NA NA
6          2          22     5  1  1  3

For those rows that have values in A, B, and C columns, I need to grab those values into the score column and add the suffices to define the question_id. For example, the third row's score is the total of these three columns (A, B, C).

My desired output is below.

> df
   student_id question_id score
1           1          10     2
2           1          11     3
3           1        12_C     1
4           1        12_E     2
5           1        12_O     1
6           2          20     2
7           2          21     1
8           2        22_C     1
9           2        22_E     1
10          2        22_O     3

CodePudding user response：

library(dplyr)
library(tidyr)

  df %>% 
  pivot_longer(cols = c(A, B, C)) %>% 
  mutate(question_id = ifelse(!is.na(value), paste(question_id, name, sep = "_"), question_id),
         score = ifelse(!is.na(value), value, score)) %>% 
  select(-c(name, value)) %>% 
  distinct()
#> # A tibble: 10 x 3
#>    student_id question_id score
#>         <dbl> <chr>       <dbl>
#>  1          1 10              2
#>  2          1 11              3
#>  3          1 12_A            1
#>  4          1 12_B            2
#>  5          1 12_C            1
#>  6          2 20              2
#>  7          2 21              1
#>  8          2 22_A            1
#>  9          2 22_B            1
#> 10          2 22_C            3

^{Created on 2021-09-15 by the reprex package (v2.0.0)}

data

df <- data.frame(
  student_id = c(1,1,1,2,2,2),
  question_id = c(10,11,12,20,21,22),
  score = c(2,3,4,2,1,5),
  A = c(NA,NA,1, NA,NA,1),
  B = c(NA,NA,2, NA,NA,1),
  C = c(NA,NA,1, NA,NA,3))

CodePudding user response：

Another option is to replace the summed score (i.e. in question 12 and 22) with NA. From there you can use pivot_longer with score:C and values_drop_na. Then convert the rows where name is score to NA. Finally unite the question_id and name columns.

df %>%
  mutate(score = ifelse(!is.na(A), NA, score)) %>%
  pivot_longer('score':'C', values_drop_na = TRUE) %>%
  mutate(name = na_if(name, 'score')) %>%
  unite('question_id', c(question_id, name), na.rm = T)

#------
# A tibble: 10 x 3
   student_id question_id value
        <dbl> <chr>       <dbl>
 1          1 10              2
 2          1 11              3
 3          1 12_A            1
 4          1 12_B            2
 5          1 12_C            1
 6          2 20              2
 7          2 21              1
 8          2 22_A            1
 9          2 22_B            1
10          2 22_C            3

CodePudding user response：

Data

df <- data.frame(
  student_id = c(1,1,1,2,2,2),
  question_id = c(10,11,12,20,21,22),
  score = c(2,3,4,2,1,5),
  C = c(NA,NA,1, NA,NA,1),
  E = c(NA,NA,2, NA,NA,1),
  O = c(NA,NA,1, NA,NA,3)
)

How to

library(tidyverse)
df %>% 
  pivot_longer(cols = c(C,E,O)) %>% 
  filter(!is.na(value)) %>% 
  mutate(question_id = str_c(question_id,name,sep = "_")) %>% 
  select(-name,-value)

  student_id question_id score
       <dbl> <chr>       <dbl>
1          1 12_C            4
2          1 12_E            4
3          1 12_O            4
4          2 22_C            5
5          2 22_E            5
6          2 22_O            5

CodePudding user response：

I'm not sure this is any better than just adding distinct() at the end as in another answer, but I was curious if you could do this without pivoting the rows with NA values so here's that:

df <- data.frame(
  student_id = c(1,1,1,2,2,2),
  question_id = c(10,11,12,20,21,22),
  score = c(2,3,4,2,1,5),
  A = c(NA,NA,1, NA,NA,1),
  B = c(NA,NA,2, NA,NA,1),
  C = c(NA,NA,1, NA,NA,3)
)

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

df %>% 
  mutate(make_long = !do.call(pmax, across(c(A, B, C), is.na))) %>% 
  group_by(ml = make_long) %>% 
  group_modify(~ {
    if (first(.x$make_long))
      pivot_longer(.x, c(A, B, C)) %>% 
        transmute(
          student_id, 
          question_id = paste(question_id, name, sep = '_'),
          score = value)
     else 
       transmute(.x, 
         student_id, 
         question_id = as.character(question_id), 
         score)
  }) %>% 
  ungroup() %>% 
  select(-ml)
#> # A tibble: 10 × 3
#>    student_id question_id score
#>         <dbl> <chr>       <dbl>
#>  1          1 10              2
#>  2          1 11              3
#>  3          2 20              2
#>  4          2 21              1
#>  5          1 12_A            1
#>  6          1 12_B            2
#>  7          1 12_C            1
#>  8          2 22_A            1
#>  9          2 22_B            1
#> 10          2 22_C            3

^{Created on 2021-09-15 by the reprex package (v2.0.1)}