I'm trying to produce some weighted summary statistics, currently using group_modify()
. For example, say I have this df:
require(tidyverse)
x <- tibble::tribble(
~treatment, ~question, ~bin, ~weight,
"first_name_of_treatment", "didyou", 1L, 5.6,
"first_name_of_treatment", "didthey", 0L, 10.3,
"first_name_of_treatment", "willyou", 1L, 45.1,
"first_name_of_treatment", "willthey", 0L, 2.2,
"first_name_of_treatment", "didntthey", 1L, 1.5,
"another_t_name", "didyou", 0L, 93.4,
"another_t_name", "didthey", NA, NA,
"another_t_name", "willyou", 1L, 52.1,
"another_t_name", "willthey", 0L, 3.9,
"another_t_name", "didntthey", NA, NA
)
If I run the following code, I get the output that's shown below.
x %>% group_by(treatment, question) %>%
group_modify(~
tibble(
n = sum(!is.na(.x)),
mean = round(mean(as.numeric(unlist(.x)), na.rm = T), digits = 2),
sd = round(sd(as.numeric(unlist(.x)), na.rm = T), digits = 2),
se = round(sd/sqrt(n), digits = 3),
ci_lo = round(mean - qnorm(1 - (.05 / 2)) * se, digits = 3), # qnorm() gets the specified Z-score
ci_hi = round(mean qnorm(1 - (.05 / 2)) * se, digits = 3),
count_na = sum(is.na(.x))
)
)
# A tibble: 10 × 9
# Groups: treatment, question [10]
treatment question n mean sd se ci_lo ci_hi count_na
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 another_t_name didntth… 0 NaN NA NA NaN NaN 2
2 another_t_name didthey 0 NaN NA NA NaN NaN 2
3 another_t_name didyou 2 46.7 66.0 46.7 -44.8 138. 0
4 another_t_name willthey 2 1.95 2.76 1.95 -1.88 5.78 0
5 another_t_name willyou 2 26.6 36.1 25.5 -23.5 76.6 0
6 first_name_of_treatment didntth… 2 1.25 0.35 0.247 0.766 1.73 0
7 first_name_of_treatment didthey 2 5.15 7.28 5.15 -4.94 15.2 0
8 first_name_of_treatment didyou 2 3.3 3.25 2.30 -1.20 7.80 0
9 first_name_of_treatment willthey 2 1.1 1.56 1.10 -1.06 3.26 0
10 first_name_of_treatment willyou 2 23.0 31.2 22.0 -20.2 66.3 0
This is close to what I want, but the statistics in the grouped tibble are for the weight
column, whereas what I want is for them to be for the bin
column and apply the weights to that.
So the ideal outcome is: the weighted mean of the bin
column from the original tibble, for each value of treatment
and question
, weighted by the weight
column (probably using weighted.mean
).
If I run select(-weight)
before running group_by(treatment, question)
I get the summary stats for the bin
column (which is what I want), but then I don't have access to the weights which means I can't apply them.
Is it possible to do what I want with a combination of weighted.mean
and group_modify()
? If not, what is a better way I can do this?
CodePudding user response:
require(tidyverse) #updated your tibble with more data
x <- tibble::tribble(
~treatment, ~question, ~bin, ~weight,
"first_name_of_treatment", "didyou", 1L, 5.6,
"first_name_of_treatment", "didthey", 0L, 10.3,
"first_name_of_treatment", "willyou", 1L, 45.1,
"first_name_of_treatment", "willthey", 0L, 2.2,
"first_name_of_treatment", "didntthey", 1L, 1.5,
"first_name_of_treatment", "didyou", 1L, 1.6,
"first_name_of_treatment", "didthey", 0L, 4.3,
"first_name_of_treatment", "willyou", 1L, 70.1,
"first_name_of_treatment", "willthey", 0L, 5.2,
"first_name_of_treatment", "didntthey", 1L, 7.5,
"another_t_name", "didyou", 0L, 93.4,
"another_t_name", "didthey", NA, NA,
"another_t_name", "willyou", 1L, 52.1,
"another_t_name", "willthey", 0L, 3.9,
"another_t_name", "didntthey", NA, NA,
"another_t_name", "didyou", 0L, 30.4,
"another_t_name", "didthey", NA, NA,
"another_t_name", "willyou", 1L, 2.1,
"another_t_name", "willthey", 0L, 13.9,
"another_t_name", "didntthey", NA, NA)
Then used summarize and across combined with mutate to do the calculations
x %>%
group_by(treatment,question,bin) %>%
summarize(
across(weight,
list(
n = ~sum(!is.na(.)),
mean = ~round(mean(as.numeric(., na.rm = T), digits = 2)),
sd = ~round(sd(as.numeric(.), na.rm = T), digits = 2),
count_na = ~sum(is.na(.))
),.names = "{.fn}"
),.groups = "drop"
) %>%
mutate(
se = round(sd/sqrt(n), digits = 3),
ci_lo = round(mean - qnorm(1 - (.05 / 2)) * se, digits = 3),
ci_hi =round(mean qnorm(1 - (.05 / 2)) * se, digits = 3),
)
CodePudding user response:
I found a solution to this just using summarize
and mutate
. It was actually much simpler than I realized!
x %>%
group_by(treatment, question) %>%
summarize(
n = sum(!is.na(bin)),
wt_mean = weighted.mean(bin, weight, na.rm = TRUE),
wt_sd = Hmisc::wtd.var(bin, weight, na.rm = TRUE),
wt_se = round(Hmisc::wtd.var(bin, weight, na.rm = TRUE)/sqrt(sum(!is.na(bin))), digits = 3),
count_na = sum(is.na(bin))
) %>%
mutate(wt_ci_lo = round(wt_mean - qnorm(1 - (.05 / 2)) * wt_se, digits = 3), # qnorm() gets the specified Z-score
wt_ci_hi = round(wt_mean qnorm(1 - (.05 / 2)) * wt_se, digits = 3))