Difficulty Calculating Mean of Values in Column-CodePudding

I am trying to calculate the mean value of cancellations for each surgery speciality. Here is my dataset:

> dput(dt)
structure(list(Service = c("Cardiac", "Cardiac", "Electrophysiology", 
"Gastroenterology", "Gastroenterology", "General", "General", 
"Gynecology", "Gynecology", "Gynecology Oncology", "Hepatology", 
"Hepatology", "Interventional Radiology", "Interventional Radiology", 
"Neuroradiology", "Neuroradiology", "Neurosurgery", "Neurosurgery", 
"Ortho Spine", "Orthopedics", "Orthopedics", "Otolaryngology Head and Neck Surgery", 
"Otolaryngology Head and Neck Surgery", "Plastics", "Plastics", 
"Pulmonary", "Pulmonary", "Thoracic", "Thoracic", "Transplant", 
"Transplant", "Urology", "Urology", "Vascular", "Vascular"), 
    binary = c("Cancelled", "Not Cancelled", "Cancelled", "Cancelled", 
    "Not Cancelled", "Cancelled", "Not Cancelled", "Cancelled", 
    "Not Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled", 
    "Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled", 
    "Cancelled", "Not Cancelled", "Cancelled", "Cancelled", "Not Cancelled", 
    "Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled", 
    "Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled", 
    "Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled", 
    "Cancelled", "Not Cancelled"), n = c(338L, 38L, 10L, 14L, 
    6L, 69L, 12L, 31L, 10L, 1L, 3L, 1L, 39L, 6L, 3L, 1L, 113L, 
    9L, 2L, 74L, 15L, 12L, 1L, 3L, 1L, 1L, 1L, 22L, 2L, 30L, 
    13L, 70L, 12L, 57L, 4L)), row.names = c(NA, -35L), groups = structure(list(
    Service = c("Cardiac", "Electrophysiology", "Gastroenterology", 
    "General", "Gynecology", "Gynecology Oncology", "Hepatology", 
    "Interventional Radiology", "Neuroradiology", "Neurosurgery", 
    "Ortho Spine", "Orthopedics", "Otolaryngology Head and Neck Surgery", 
    "Plastics", "Pulmonary", "Thoracic", "Transplant", "Urology", 
    "Vascular"), .rows = structure(list(1:2, 3L, 4:5, 6:7, 8:9, 
        10L, 11:12, 13:14, 15:16, 17:18, 19L, 20:21, 22:23, 24:25, 
        26:27, 28:29, 30:31, 32:33, 34:35), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -19L), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

For example, for the Cardiac speciality, I want to calculate the mean number of cancellations as per total number of procedures (38/38 338).

I tried this:

dt.1 <- dt %>% 
  group_by(Service) %>% 
  summarise(`Cancelled` = mean(binary == "Cancelled")*100, 
            `Not Cancelled` = mean(!binary == "Cancelled")*100)

but got very weird values.

CodePudding user response：

in base R you would do:

prop.table(xtabs(n~.,dt), 1) * 100
                                      binary
Service                                 Cancelled Not Cancelled
  Cardiac                               89.893617     10.106383
  Electrophysiology                    100.000000      0.000000
  Gastroenterology                      70.000000     30.000000
  General                               85.185185     14.814815
  Gynecology                            75.609756     24.390244
  Gynecology Oncology                    0.000000    100.000000
  Hepatology                            75.000000     25.000000
  Interventional Radiology              86.666667     13.333333
  Neuroradiology                        75.000000     25.000000
  Neurosurgery                          92.622951      7.377049
  Ortho Spine                          100.000000      0.000000
  Orthopedics                           83.146067     16.853933
  Otolaryngology Head and Neck Surgery  92.307692      7.692308
  Plastics                              75.000000     25.000000
  Pulmonary                             50.000000     50.000000
  Thoracic                              91.666667      8.333333
  Transplant                            69.767442     30.232558
  Urology                               85.365854     14.634146
  Vascular                              93.442623      6.557377

if you need a data.frame back, just wrap the code above as:

as.data.frame.matrix(prop.table(xtabs(n~.,dt), 1)*100)

CodePudding user response：

We may use the logical condition for subsetting the 'n'

library(dplyr)
dt %>% 
  group_by(Service) %>% 
  summarise(Perc_not_cancelled =
     100 *n[binary != 'Cancelled'][1]/sum(n),
    Perc_cancelled = 100 * n[binary == 'Cancelled'][1]/sum(n), .groups = 'drop' )

-output

# A tibble: 19 × 3
   Service                              Perc_not_cancelled Perc_cancelled
   <chr>                                             <dbl>          <dbl>
 1 Cardiac                                           10.1            89.9
 2 Electrophysiology                                 NA             100  
 3 Gastroenterology                                  30              70  
 4 General                                           14.8            85.2
 5 Gynecology                                        24.4            75.6
 6 Gynecology Oncology                              100              NA  
 7 Hepatology                                        25              75  
 8 Interventional Radiology                          13.3            86.7
 9 Neuroradiology                                    25              75  
10 Neurosurgery                                       7.38           92.6
11 Ortho Spine                                       NA             100  
12 Orthopedics                                       16.9            83.1
13 Otolaryngology Head and Neck Surgery               7.69           92.3
14 Plastics                                          25              75  
15 Pulmonary                                         50              50  
16 Thoracic                                           8.33           91.7
17 Transplant                                        30.2            69.8
18 Urology                                           14.6            85.4
19 Vascular                                           6.56           93.4

-checking

> 38/(38 338) * 100
[1] 10.10638

If we want to replace the NA with 0

library(tidyr)
dt %>% 
  group_by(Service) %>% 
  summarise(Perc_not_cancelled =
     100 *n[binary != 'Cancelled'][1]/sum(n),
    Perc_cancelled = 100 * n[binary == 'Cancelled'][1]/sum(n), 
      .groups = 'drop' ) %>% 
   mutate(across(where(is.numeric), replace_na, 0))
# A tibble: 19 × 3
   Service                              Perc_not_cancelled Perc_cancelled
   <chr>                                             <dbl>          <dbl>
 1 Cardiac                                           10.1            89.9
 2 Electrophysiology                                  0             100  
 3 Gastroenterology                                  30              70  
 4 General                                           14.8            85.2
 5 Gynecology                                        24.4            75.6
 6 Gynecology Oncology                              100               0  
 7 Hepatology                                        25              75  
 8 Interventional Radiology                          13.3            86.7
 9 Neuroradiology                                    25              75  
10 Neurosurgery                                       7.38           92.6
11 Ortho Spine                                        0             100  
12 Orthopedics                                       16.9            83.1
13 Otolaryngology Head and Neck Surgery               7.69           92.3
14 Plastics                                          25              75  
15 Pulmonary                                         50              50  
16 Thoracic                                           8.33           91.7
17 Transplant                                        30.2            69.8
18 Urology                                           14.6            85.4
19 Vascular                                           6.56           93.4

Or another option is pivot_wider

dt %>% 
  group_by(Service) %>% 
  mutate(n = 100 *proportions(n)) %>% 
  ungroup %>%
  pivot_wider(names_from = "binary", values_from = n,
     values_fill = 0, names_glue = "Perc_{.name}")