How to create unit record data from summary table-CodePudding

I have a summary table I need to expand into unit-level observations to run test statistics on.

The summary table looks like this

tbl_summmary <-
  tibble(
    outcome = c("A (%)", "B (%)", "C (%)", "D (%)", "Total (n)"),
    group_1 = c(.25, .25, .125, .325, 10),
    group_2 = c(.50, 0.0, .325, .125, 20),
    group_3 = c(.10, .40, .125, .325, 40))

tbl_summmary

The result needs to be a long format dataframe of the data described in the summary table.

df_simulation <- 
  bind_rows(
    tibble(
      group = 1,
      outcome = c(
        rep("A", .25*10), 
        rep("B", .25*10), 
        rep("C", .125*10), 
        rep("D", .325*10))),
    tibble(
      group = 2,
      outcome = c(
        rep("A", .50*20), 
        rep("B", .00*20), 
        rep("C", .325*20), 
        rep("D", .125*20))),
    tibble(
      group = 3,
      outcome = c(
        rep("A", .10*40), 
        rep("B", .40*40), 
        rep("C", .125*40), 
        rep("D", .325*40))))

df_simulation

However, the code needs to iterate over multiple variables and multiple groups. Typing out each group and variable manually like in the above won't be scalable. A way of doing this programmatically would be much appreciated!

CodePudding user response：

One option would be to use tidyr::uncount after some additional data wrangling steps:

library(tidyr)
library(dplyr)

# Get df of totals
totals <- tbl_summmary[nrow(tbl_summmary),] |> 
  pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |> 
  select(group, total = value)
# Get df of outcomes
outcomes <- tbl_summmary[-nrow(tbl_summmary),] |> 
  mutate(outcome = gsub("^(\\w ).*$", "\\1", outcome)) |> 
  pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |> 
  left_join(totals, by = "group") |> 
  # We need integers, so use round
  mutate(n = round(value * total)) |>
  select(group, outcome, n) |> 
  uncount(n) |> 
  mutate(group = as.numeric(group)) |> 
  arrange(group, outcome)

count(outcomes, outcome, group)
#> # A tibble: 11 × 3
#>    outcome group     n
#>    <chr>   <dbl> <int>
#>  1 A           1     2
#>  2 A           2    10
#>  3 A           3     4
#>  4 B           1     2
#>  5 B           3    16
#>  6 C           1     1
#>  7 C           2     6
#>  8 C           3     5
#>  9 D           1     3
#> 10 D           2     2
#> 11 D           3    13

identical(df_simulation, outcomes)
#> [1] TRUE

CodePudding user response：

I do like the approach by @stefan.

If you want the format you requested, it can be done by listing the outcome values by the total amount then unnesting the values, then putting it into long format.

library(dplyr)
library(tidyr)
library(stringr)

df <- tbl_summmary |> 
      mutate(across(starts_with("group"), ~ .x *.x[5])) |> 
      rowwise() |> 
      mutate(across(starts_with("group"), ~ 
                      ifelse(.x != 0,
                             list(rep(str_remove(outcome, " \\(%\\)"), .x)),
                             list(NA_character_))))
df[-5, -1] |> 
  unnest_wider(col = everything(), names_sep = "_") |> 
  stack() |> 
  mutate(ind = str_extract(str_remove(ind, "_\\d $"), "\\d"), .before = 1) |> 
  rename(group = ind, outcome = values) |> 
  drop_na()

   outcome group
1        A     1
2        B     1
3        C     1
4        D     1
5        A     1
6        B     1
7        D     1
8        D     1
9        A     2
10       C     2
11       D     2
12       A     2
13       C     2
14       D     2
15       A     2
16       C     2
17       A     2
18       C     2
19       A     2
20       C     2
21       A     2
22       C     2
23       A     2
24       A     2
25       A     2
26       A     2
27       A     3
28       B     3
29       C     3
30       D     3
31       A     3
32       B     3
33       C     3
34       D     3
35       A     3
36       B     3
37       C     3
38       D     3
39       A     3
40       B     3
41       C     3
42       D     3
43       B     3
44       C     3
45       D     3
46       B     3
47       D     3
48       B     3
49       D     3
50       B     3
51       D     3
52       B     3
53       D     3
54       B     3
55       D     3
56       B     3
57       D     3
58       B     3
59       D     3
60       B     3
61       D     3
62       B     3
63       B     3
64       B     3