Pass more than one column through to group

I'm trying to produce some weighted summary statistics, currently using group_modify(). For example, say I have this df:

require(tidyverse)

x <- tibble::tribble(
                 ~treatment,   ~question, ~bin, ~weight,
  "first_name_of_treatment",    "didyou",   1L,     5.6,
  "first_name_of_treatment",   "didthey",   0L,    10.3,
  "first_name_of_treatment",   "willyou",   1L,    45.1,
  "first_name_of_treatment",  "willthey",   0L,     2.2,
  "first_name_of_treatment", "didntthey",   1L,     1.5,
           "another_t_name",    "didyou",   0L,    93.4,
           "another_t_name",   "didthey",   NA,      NA,
           "another_t_name",   "willyou",   1L,    52.1,
           "another_t_name",  "willthey",   0L,     3.9,
           "another_t_name", "didntthey",   NA,      NA
  )

If I run the following code, I get the output that's shown below.

x %>% group_by(treatment, question) %>% 
    group_modify(~
                     tibble(
                         n = sum(!is.na(.x)), 
                         mean = round(mean(as.numeric(unlist(.x)), na.rm = T), digits = 2), 
                         sd = round(sd(as.numeric(unlist(.x)), na.rm = T), digits = 2), 
                         se = round(sd/sqrt(n), digits = 3), 
                         ci_lo = round(mean - qnorm(1 - (.05 / 2)) * se, digits = 3),  # qnorm() gets the specified Z-score
                         ci_hi = round(mean   qnorm(1 - (.05 / 2)) * se, digits = 3),
                         count_na = sum(is.na(.x))
                     )
    )

# A tibble: 10 × 9
# Groups:   treatment, question [10]
   treatment               question     n   mean    sd     se   ci_lo  ci_hi count_na
   <chr>                   <chr>    <int>  <dbl> <dbl>  <dbl>   <dbl>  <dbl>    <int>
 1 another_t_name          didntth…     0 NaN    NA    NA     NaN     NaN           2
 2 another_t_name          didthey      0 NaN    NA    NA     NaN     NaN           2
 3 another_t_name          didyou       2  46.7  66.0  46.7   -44.8   138.          0
 4 another_t_name          willthey     2   1.95  2.76  1.95   -1.88    5.78        0
 5 another_t_name          willyou      2  26.6  36.1  25.5   -23.5    76.6         0
 6 first_name_of_treatment didntth…     2   1.25  0.35  0.247   0.766   1.73        0
 7 first_name_of_treatment didthey      2   5.15  7.28  5.15   -4.94   15.2         0
 8 first_name_of_treatment didyou       2   3.3   3.25  2.30   -1.20    7.80        0
 9 first_name_of_treatment willthey     2   1.1   1.56  1.10   -1.06    3.26        0
10 first_name_of_treatment willyou      2  23.0  31.2  22.0   -20.2    66.3         0

This is close to what I want, but the statistics in the grouped tibble are for the weight column, whereas what I want is for them to be for the bin column and apply the weights to that.

So the ideal outcome is: the weighted mean of the bin column from the original tibble, for each value of treatment and question, weighted by the weight column (probably using weighted.mean).

If I run select(-weight) before running group_by(treatment, question) I get the summary stats for the bin column (which is what I want), but then I don't have access to the weights which means I can't apply them.

Is it possible to do what I want with a combination of weighted.mean and group_modify()? If not, what is a better way I can do this?

CodePudding user response：

require(tidyverse) #updated your tibble with more data

x <- tibble::tribble(
                 ~treatment,   ~question, ~bin, ~weight,
  "first_name_of_treatment",    "didyou",   1L,     5.6,
  "first_name_of_treatment",   "didthey",   0L,    10.3,
  "first_name_of_treatment",   "willyou",   1L,    45.1,
  "first_name_of_treatment",  "willthey",   0L,     2.2,
  "first_name_of_treatment", "didntthey",   1L,     1.5,
  "first_name_of_treatment",    "didyou",   1L,     1.6,
  "first_name_of_treatment",   "didthey",   0L,    4.3,
  "first_name_of_treatment",   "willyou",   1L,    70.1,
  "first_name_of_treatment",  "willthey",   0L,     5.2,
  "first_name_of_treatment", "didntthey",   1L,     7.5,
           "another_t_name",    "didyou",   0L,    93.4,
           "another_t_name",   "didthey",   NA,      NA,
           "another_t_name",   "willyou",   1L,    52.1,
           "another_t_name",  "willthey",   0L,     3.9,
           "another_t_name", "didntthey",   NA,      NA,
           "another_t_name",    "didyou",   0L,    30.4,
           "another_t_name",   "didthey",   NA,      NA,
           "another_t_name",   "willyou",   1L,    2.1,
           "another_t_name",  "willthey",   0L,     13.9,
           "another_t_name", "didntthey",   NA,      NA)

Then used summarize and across combined with mutate to do the calculations

x %>% 
  group_by(treatment,question,bin) %>% 
  summarize(
    across(weight,
         list( 
           n = ~sum(!is.na(.)),
           mean = ~round(mean(as.numeric(., na.rm = T), digits = 2)),
           sd = ~round(sd(as.numeric(.), na.rm = T), digits = 2),
           count_na = ~sum(is.na(.))
         ),.names = "{.fn}"
    ),.groups = "drop"
  ) %>% 
  mutate(
    se = round(sd/sqrt(n), digits = 3),
    ci_lo = round(mean - qnorm(1 - (.05 / 2)) * se, digits = 3),
    ci_hi =round(mean   qnorm(1 - (.05 / 2)) * se, digits = 3),
  )

CodePudding user response：

I found a solution to this just using summarize and mutate. It was actually much simpler than I realized!

x %>%
  group_by(treatment, question) %>%
  summarize(
    n = sum(!is.na(bin)),
    wt_mean = weighted.mean(bin, weight, na.rm = TRUE),
    wt_sd = Hmisc::wtd.var(bin, weight, na.rm = TRUE),
    wt_se = round(Hmisc::wtd.var(bin, weight, na.rm = TRUE)/sqrt(sum(!is.na(bin))), digits = 3), 
    count_na = sum(is.na(bin))
  ) %>%
  mutate(wt_ci_lo = round(wt_mean - qnorm(1 - (.05 / 2)) * wt_se, digits = 3),  # qnorm() gets the specified Z-score
         wt_ci_hi = round(wt_mean   qnorm(1 - (.05 / 2)) * wt_se, digits = 3))