Pivot longer multiple columns in tibble-CodePudding

Say I have this data frame:

structure(list(uniqueID = c(1234L, 8321L, 4592L, 6166L, 9801L, 
9733L, 2103L, 2108L), treatment = c("treatment_desc", "another_way", 
"text_to_describe", "im_a_treatment", "treatment_five", "more_text_tr", 
"signaling", "or_not_tr"), some_text_bin = c(0L, 1L, 1L, 0L, 
1L, 0L, 0L, 0L), other_string_bin = c(1L, 1L, 1L, NA, 1L, 1L, 
0L, 0L), words_bin = c(0L, NA, 0L, 1L, 0L, 1L, 0L, 1L), something_else_bin = c(0L, 
1L, 0L, 0L, 0L, 0L, NA, 0L)), class = "data.frame", row.names = c(NA, 
-8L))

Which looks like this:

  uniqueID        treatment some_text_bin other_string_bin words_bin something_else_bin
1     1234   treatment_desc             0                1         0                  0
2     8321      another_way             1                1        NA                  1
3     4592 text_to_describe             1                1         0                  0
4     6166   im_a_treatment             0               NA         1                  0
5     9801   treatment_five             1                1         0                  0
6     9733     more_text_tr             0                1         1                  0
7     2103        signaling             0                0         0                 NA
8     2108        or_not_tr             0                0         1                  0

I want to pivot it longer so that I can have one column for the mean, one for the sd, and one for the count of non-missing values. For example, it would look something like this:

   uniqueID        treatment         new_column mean sd  n
1      1234   treatment_desc          some_text   NA NA NA
2      8321      another_way          some_text   NA NA NA
3      4592 text_to_describe          some_text   NA NA NA
4      6166   im_a_treatment          some_text   NA NA NA
5      9801   treatment_five          some_text   NA NA NA
6      9733     more_text_tr          some_text   NA NA NA
7      2103        signaling          some_text   NA NA NA
8      2108        or_not_tr          some_text   NA NA NA
9      1234   treatment_desc   other_string_bin   NA NA NA
10     8321      another_way   other_string_bin   NA NA NA
11     4592 text_to_describe   other_string_bin   NA NA NA
12     6166   im_a_treatment   other_string_bin   NA NA NA
13     9801   treatment_five   other_string_bin   NA NA NA
14     9733     more_text_tr   other_string_bin   NA NA NA
15     2103        signaling   other_string_bin   NA NA NA
16     2108        or_not_tr   other_string_bin   NA NA NA
17     1234   treatment_desc          words_bin   NA NA NA
18     8321      another_way          words_bin   NA NA NA
19     4592 text_to_describe          words_bin   NA NA NA
20     6166   im_a_treatment          words_bin   NA NA NA
21     9801   treatment_five          words_bin   NA NA NA
22     9733     more_text_tr          words_bin   NA NA NA
23     2103        signaling          words_bin   NA NA NA
24     2108        or_not_tr          words_bin   NA NA NA
25     1234   treatment_desc something_else_bin   NA NA NA
26     8321      another_way something_else_bin   NA NA NA
27     4592 text_to_describe something_else_bin   NA NA NA
28     6166   im_a_treatment something_else_bin   NA NA NA
29     9801   treatment_five something_else_bin   NA NA NA
30     9733     more_text_tr something_else_bin   NA NA NA
31     2103        signaling something_else_bin   NA NA NA
32     2108        or_not_tr something_else_bin   NA NA NA

Except, where the NA values are not NA. I've got somewhat close to what I want using this code:

require(tidyverse) 

df %>% 
    group_by(treatment) %>% 
    summarize(across(ends_with("_bin"), 
    list(mean = mean, sd = sd, n = ~ sum(!is.na(.x))),
     na.rm = TRUE, 
     .names = "{.fn}_{.col}")) %>% 
     pivot_longer(col = starts_with("mean"), names_to = "question", names_prefix = "mean_", values_to = "mean")

But this only does what I want for the mean column. I've tried piping the output into another pivot_longer to get the sd (and then the N) but this is returning errors.

Does anyone have any thoughts on the best approach here?

Another possible solution, that I am close to making work is something like this:

      group_by(treatment) %>% 
      select(ends_with("bin"), treatment) %>% 
      group_modify(~  
          tibble( 
               n = sum(!is.na(.x)), 
               mean = round(mean(as.numeric(.x), na.rm = T)), 
               sd = round(sd(as.numeric(.x), na.rm = T)), 
               se = round(sd/sqrt(n)), 
               count_na = sum(is.na(.x)) 
           ) 
       )

I actually strongly prefer this second solution if it can work for my situation.

CodePudding user response：

You could use pivot_longer to first put it in long format and then use mutate from dplyr.

tbl_df_long <- pivot_longer(
  data = tbl_df, some_text_bin:something_else_bin,
  names_to = c("new_column"),
  values_to = c("values"))

tbl_df_long <- tbl_df_long %>% 
  group_by(new_column) %>% 
  mutate(mean = round(mean(as.numeric(values), na.rm = TRUE)),
         sd = round(sd(as.numeric(values), na.rm = TRUE))) %>% 
  ungroup()

CodePudding user response：

Another approach i've arrived at is:

df %>% 
    select(ends_with("bin"), treatment) %>% 
    pivot_longer(!treatment, names_to = "question", values_to = "scores") %>% 
    group_by(treatment, question) %>% 
    group_modify(~
        tibble(
            n = sum(!is.na(.x)), 
            mean = round(mean(as.numeric(unlist(.x)), na.rm = T)), 
            sd = round(sd(as.numeric(unlist(.x)), na.rm = T)), 
            se = round(sd/sqrt(n), digits = 3), 
            ci_lo = round(mean - qnorm(1 - (.05 / 2)) * se),  # qnorm() gets the specified Z-score
            ci_hi = round(mean   qnorm(1 - (.05 / 2)) * se),
            count_na = sum(is.na(.x))
        )
)