I am trying to calculate the mean value of cancellations for each surgery speciality. Here is my dataset:
> dput(dt)
structure(list(Service = c("Cardiac", "Cardiac", "Electrophysiology",
"Gastroenterology", "Gastroenterology", "General", "General",
"Gynecology", "Gynecology", "Gynecology Oncology", "Hepatology",
"Hepatology", "Interventional Radiology", "Interventional Radiology",
"Neuroradiology", "Neuroradiology", "Neurosurgery", "Neurosurgery",
"Ortho Spine", "Orthopedics", "Orthopedics", "Otolaryngology Head and Neck Surgery",
"Otolaryngology Head and Neck Surgery", "Plastics", "Plastics",
"Pulmonary", "Pulmonary", "Thoracic", "Thoracic", "Transplant",
"Transplant", "Urology", "Urology", "Vascular", "Vascular"),
binary = c("Cancelled", "Not Cancelled", "Cancelled", "Cancelled",
"Not Cancelled", "Cancelled", "Not Cancelled", "Cancelled",
"Not Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled",
"Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled",
"Cancelled", "Not Cancelled", "Cancelled", "Cancelled", "Not Cancelled",
"Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled",
"Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled",
"Cancelled", "Not Cancelled", "Cancelled", "Not Cancelled",
"Cancelled", "Not Cancelled"), n = c(338L, 38L, 10L, 14L,
6L, 69L, 12L, 31L, 10L, 1L, 3L, 1L, 39L, 6L, 3L, 1L, 113L,
9L, 2L, 74L, 15L, 12L, 1L, 3L, 1L, 1L, 1L, 22L, 2L, 30L,
13L, 70L, 12L, 57L, 4L)), row.names = c(NA, -35L), groups = structure(list(
Service = c("Cardiac", "Electrophysiology", "Gastroenterology",
"General", "Gynecology", "Gynecology Oncology", "Hepatology",
"Interventional Radiology", "Neuroradiology", "Neurosurgery",
"Ortho Spine", "Orthopedics", "Otolaryngology Head and Neck Surgery",
"Plastics", "Pulmonary", "Thoracic", "Transplant", "Urology",
"Vascular"), .rows = structure(list(1:2, 3L, 4:5, 6:7, 8:9,
10L, 11:12, 13:14, 15:16, 17:18, 19L, 20:21, 22:23, 24:25,
26:27, 28:29, 30:31, 32:33, 34:35), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -19L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
For example, for the Cardiac
speciality, I want to calculate the mean number of cancellations as per total number of procedures (38/38 338)
.
I tried this:
dt.1 <- dt %>%
group_by(Service) %>%
summarise(`Cancelled` = mean(binary == "Cancelled")*100,
`Not Cancelled` = mean(!binary == "Cancelled")*100)
but got very weird values.
CodePudding user response:
in base R you would do:
prop.table(xtabs(n~.,dt), 1) * 100
binary
Service Cancelled Not Cancelled
Cardiac 89.893617 10.106383
Electrophysiology 100.000000 0.000000
Gastroenterology 70.000000 30.000000
General 85.185185 14.814815
Gynecology 75.609756 24.390244
Gynecology Oncology 0.000000 100.000000
Hepatology 75.000000 25.000000
Interventional Radiology 86.666667 13.333333
Neuroradiology 75.000000 25.000000
Neurosurgery 92.622951 7.377049
Ortho Spine 100.000000 0.000000
Orthopedics 83.146067 16.853933
Otolaryngology Head and Neck Surgery 92.307692 7.692308
Plastics 75.000000 25.000000
Pulmonary 50.000000 50.000000
Thoracic 91.666667 8.333333
Transplant 69.767442 30.232558
Urology 85.365854 14.634146
Vascular 93.442623 6.557377
if you need a data.frame
back, just wrap the code above as:
as.data.frame.matrix(prop.table(xtabs(n~.,dt), 1)*100)
CodePudding user response:
We may use the logical condition for subsetting the 'n'
library(dplyr)
dt %>%
group_by(Service) %>%
summarise(Perc_not_cancelled =
100 *n[binary != 'Cancelled'][1]/sum(n),
Perc_cancelled = 100 * n[binary == 'Cancelled'][1]/sum(n), .groups = 'drop' )
-output
# A tibble: 19 × 3
Service Perc_not_cancelled Perc_cancelled
<chr> <dbl> <dbl>
1 Cardiac 10.1 89.9
2 Electrophysiology NA 100
3 Gastroenterology 30 70
4 General 14.8 85.2
5 Gynecology 24.4 75.6
6 Gynecology Oncology 100 NA
7 Hepatology 25 75
8 Interventional Radiology 13.3 86.7
9 Neuroradiology 25 75
10 Neurosurgery 7.38 92.6
11 Ortho Spine NA 100
12 Orthopedics 16.9 83.1
13 Otolaryngology Head and Neck Surgery 7.69 92.3
14 Plastics 25 75
15 Pulmonary 50 50
16 Thoracic 8.33 91.7
17 Transplant 30.2 69.8
18 Urology 14.6 85.4
19 Vascular 6.56 93.4
-checking
> 38/(38 338) * 100
[1] 10.10638
If we want to replace the NA
with 0
library(tidyr)
dt %>%
group_by(Service) %>%
summarise(Perc_not_cancelled =
100 *n[binary != 'Cancelled'][1]/sum(n),
Perc_cancelled = 100 * n[binary == 'Cancelled'][1]/sum(n),
.groups = 'drop' ) %>%
mutate(across(where(is.numeric), replace_na, 0))
# A tibble: 19 × 3
Service Perc_not_cancelled Perc_cancelled
<chr> <dbl> <dbl>
1 Cardiac 10.1 89.9
2 Electrophysiology 0 100
3 Gastroenterology 30 70
4 General 14.8 85.2
5 Gynecology 24.4 75.6
6 Gynecology Oncology 100 0
7 Hepatology 25 75
8 Interventional Radiology 13.3 86.7
9 Neuroradiology 25 75
10 Neurosurgery 7.38 92.6
11 Ortho Spine 0 100
12 Orthopedics 16.9 83.1
13 Otolaryngology Head and Neck Surgery 7.69 92.3
14 Plastics 25 75
15 Pulmonary 50 50
16 Thoracic 8.33 91.7
17 Transplant 30.2 69.8
18 Urology 14.6 85.4
19 Vascular 6.56 93.4
Or another option is pivot_wider
dt %>%
group_by(Service) %>%
mutate(n = 100 *proportions(n)) %>%
ungroup %>%
pivot_wider(names_from = "binary", values_from = n,
values_fill = 0, names_glue = "Perc_{.name}")