I have a summary table I need to expand into unit-level observations to run test statistics on.
The summary table looks like this
tbl_summmary <-
tibble(
outcome = c("A (%)", "B (%)", "C (%)", "D (%)", "Total (n)"),
group_1 = c(.25, .25, .125, .325, 10),
group_2 = c(.50, 0.0, .325, .125, 20),
group_3 = c(.10, .40, .125, .325, 40))
tbl_summmary
The result needs to be a long format dataframe of the data described in the summary table.
df_simulation <-
bind_rows(
tibble(
group = 1,
outcome = c(
rep("A", .25*10),
rep("B", .25*10),
rep("C", .125*10),
rep("D", .325*10))),
tibble(
group = 2,
outcome = c(
rep("A", .50*20),
rep("B", .00*20),
rep("C", .325*20),
rep("D", .125*20))),
tibble(
group = 3,
outcome = c(
rep("A", .10*40),
rep("B", .40*40),
rep("C", .125*40),
rep("D", .325*40))))
df_simulation
However, the code needs to iterate over multiple variables and multiple groups. Typing out each group and variable manually like in the above won't be scalable. A way of doing this programmatically would be much appreciated!
CodePudding user response:
One option would be to use tidyr::uncount
after some additional data wrangling steps:
library(tidyr)
library(dplyr)
# Get df of totals
totals <- tbl_summmary[nrow(tbl_summmary),] |>
pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |>
select(group, total = value)
# Get df of outcomes
outcomes <- tbl_summmary[-nrow(tbl_summmary),] |>
mutate(outcome = gsub("^(\\w ).*$", "\\1", outcome)) |>
pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |>
left_join(totals, by = "group") |>
# We need integers, so use round
mutate(n = round(value * total)) |>
select(group, outcome, n) |>
uncount(n) |>
mutate(group = as.numeric(group)) |>
arrange(group, outcome)
count(outcomes, outcome, group)
#> # A tibble: 11 × 3
#> outcome group n
#> <chr> <dbl> <int>
#> 1 A 1 2
#> 2 A 2 10
#> 3 A 3 4
#> 4 B 1 2
#> 5 B 3 16
#> 6 C 1 1
#> 7 C 2 6
#> 8 C 3 5
#> 9 D 1 3
#> 10 D 2 2
#> 11 D 3 13
identical(df_simulation, outcomes)
#> [1] TRUE
CodePudding user response:
I do like the approach by @stefan.
If you want the format you requested, it can be done by listing the outcome values by the total amount then unnesting the values, then putting it into long format.
library(dplyr)
library(tidyr)
library(stringr)
df <- tbl_summmary |>
mutate(across(starts_with("group"), ~ .x *.x[5])) |>
rowwise() |>
mutate(across(starts_with("group"), ~
ifelse(.x != 0,
list(rep(str_remove(outcome, " \\(%\\)"), .x)),
list(NA_character_))))
df[-5, -1] |>
unnest_wider(col = everything(), names_sep = "_") |>
stack() |>
mutate(ind = str_extract(str_remove(ind, "_\\d $"), "\\d"), .before = 1) |>
rename(group = ind, outcome = values) |>
drop_na()
outcome group
1 A 1
2 B 1
3 C 1
4 D 1
5 A 1
6 B 1
7 D 1
8 D 1
9 A 2
10 C 2
11 D 2
12 A 2
13 C 2
14 D 2
15 A 2
16 C 2
17 A 2
18 C 2
19 A 2
20 C 2
21 A 2
22 C 2
23 A 2
24 A 2
25 A 2
26 A 2
27 A 3
28 B 3
29 C 3
30 D 3
31 A 3
32 B 3
33 C 3
34 D 3
35 A 3
36 B 3
37 C 3
38 D 3
39 A 3
40 B 3
41 C 3
42 D 3
43 B 3
44 C 3
45 D 3
46 B 3
47 D 3
48 B 3
49 D 3
50 B 3
51 D 3
52 B 3
53 D 3
54 B 3
55 D 3
56 B 3
57 D 3
58 B 3
59 D 3
60 B 3
61 D 3
62 B 3
63 B 3
64 B 3