Home > Software engineering >  How to calculate a proportion in R
How to calculate a proportion in R

Time:03-26

I have this reproducible DataFrame:

 structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498, 
 79.88495, 93.01599, 62.37097, 86.83899, 85.65594, 42.25897), 
     death = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), sex = c("male", 
     "female", "female", "female", "female", "male", "male", "male", 
     "male", "female"), hospdead = c(0, 1, 0, 0, 0, 1, 0, 0, 0, 
     0), slos = c(5, 4, 17, 3, 16, 4, 9, 7, 12, 8), d.time = c(2029, 
     4, 47, 133, 2029, 4, 659, 142, 63, 370), dzgroup = c("Lung Cancer", 
     "Cirrhosis", "Cirrhosis", "Lung Cancer", "ARF/MOSF w/Sepsis", 
     "Coma", "CHF", "CHF", "Lung Cancer", "Colon Cancer"), dzclass = c("Cancer", 
     "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", "ARF/MOSF", 
     "Coma", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", 
     "Cancer"), num.co = c(0, 2, 2, 2, 1, 1, 1, 3, 2, 0), edu = c(11, 
     12, 12, 11, NA, 14, 14, NA, 12, 11), income = c("$11-$25k", 
     "$11-$25k", "under $11k", "under $11k", NA, NA, "$25-$50k", 
     NA, NA, "$25-$50k"), scoma = c(0, 44, 0, 0, 26, 55, 0, 26, 
     26, 0), charges = c(9715, 34496, 41094, 3075, 50127, 6884, 
     30460, 30460, NA, 9914), totcst = c(NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_), totmcst = c(NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
     ), avtisst = c(7, 29, 13, 7, 18.666656, 5, 8, 6.5, 8.5, 8
     ), race = c("other", "white", "white", "white", "white", 
     "white", "white", "white", "black", "hispanic"), sps = c(33.8984375, 
     52.6953125, 20.5, 20.0976562, 23.5, 19.3984375, 17.296875, 
     21.5976562, 15.8984375, 2.2998047), aps = c(20, 74, 45, 19, 
     30, 27, 46, 53, 17, 9), surv2m = c(0.262939453, 0.0009999275, 
     0.790893555, 0.698974609, 0.634887695, 0.284973145, 0.892944336, 
     0.670898438, 0.570922852, 0.952880859), surv6m = c(0.0369949341, 
     0, 0.664916992, 0.411987305, 0.532958984, 0.214996338, 0.820922852, 
     0.498962402, 0.24899292, 0.887939453), hday = c(1, 3, 4, 
     1, 3, 1, 1, 1, 1, 1), diabetes = c(0, 0, 0, 0, 0, 0, 0, 1, 
     0, 0), dementia = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), ca = c("metastatic", 
     "no", "no", "metastatic", "no", "no", "no", "no", "metastatic", 
     "metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619, 0.899999619, 
     0, NA, 0.799999714, 0.049999982, NA), prg6m = c(0.25, 0, 
     0.5, 0.5, 0.8999996, 0, 0.6999998, 0.3999999, 0.0001249999, 
     NA), dnr = c("no dnr", NA, "no dnr", "no dnr", "no dnr", 
     "no dnr", "no dnr", "no dnr", "dnr after sadm", "no dnr"), 
     dnrday = c(5, NA, 17, 3, 16, 4, 9, 7, 2, 8), meanbp = c(97, 
     43, 70, 75, 59, 110, 78, 72, 97, 84), wblc = c(6, 17.0976562, 
     8.5, 9.09960938, 13.5, 10.3984375, 11.6992188, 13.5996094, 
     9.69921875, 11.2988281), hrt = c(69, 112, 88, 88, 112, 101, 
     120, 100, 56, 94), resp = c(22, 34, 28, 32, 20, 44, 28, 26, 
     20, 20), temp = c(36, 34.59375, 37.39844, 35, 37.89844, 38.39844, 
     37.39844, 37.59375, 36.59375, 38.19531), pafi = c(388, 98, 
     231.65625, NA, 173.3125, 266.625, 309.5, 404.75, 357.125, 
     NA), alb = c(1.7998047, NA, NA, NA, NA, NA, 4.7998047, NA, 
     NA, 4.6992188), bili = c(0.19998169, NA, 2.19970703, NA, 
     NA, NA, 0.39996338, NA, 0.39996338, 0.19998169), crea = c(1.19995117, 
     5.5, 2, 0.79992676, 0.79992676, 0.69995117, 1.59985352, 2, 
     1, 0.79992676), sod = c(141, 132, 134, 139, 143, 140, 132, 
     139, 143, 139), ph = c(7.459961, 7.25, 7.459961, NA, 7.509766, 
     7.65918, 7.479492, 7.509766, 7.449219, NA), glucose = c(NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_), bun = c(NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_), urine = c(NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_), adlp = c(7, NA, 1, 0, NA, NA, 0, NA, NA, 0), adls = c(7, 
     1, 0, 0, 2, 1, 1, 0, 7, NA), sfdm2 = c(NA, "<2 mo. follow-up", 
     "<2 mo. follow-up", "no(M2 and SIP pres)", "no(M2 and SIP pres)", 
     "<2 mo. follow-up", "no(M2 and SIP pres)", NA, NA, NA), adlsc = c(7, 
     1, 0, 0, 2, 1, 1, 0, 7, 0.4947999)), row.names = c(NA, 10L
 ), class = "data.frame")

I need to calculate the proportion of patients who died in the hospital among patients with an active DNR order on day 3 and among patients without an active DNR order on day 3. To separate the patients who had an active DNR order on day 3 from those who did not, I used the subset function below:

# Keep only the patients whose DNR day is known, then split them by whether
# the DNR day falls on or before day 3 (YesDNR) or after day 3 (NoDNR).
SB_xlsx1 <- SB_xlsx[!is.na(SB_xlsx$dnrday), ]
YesDNR <- SB_xlsx1[SB_xlsx1$dnrday <= 3, , drop = FALSE]
NoDNR <- SB_xlsx1[SB_xlsx1$dnrday > 3, , drop = FALSE]

However, I don't know how to calculate the proportion of patients who died in the hospital in each of these two groups (DNR by day 3 and no DNR by day 3). The 'hospdead' variable contains only 0s and 1s, where 0 = not dead and 1 = dead. What code could I use to get this result? (SB_xlsx is simply the name of my DataFrame.)

CodePudding user response:

Since your outcome variable has 0-1 coding, you can use mean() to get what you need:

# The question asks for deaths in the hospital, which is the hospdead column
# (0 = not dead, 1 = dead); death is a separate column with different values.
# The mean of a 0/1 column is the proportion of 1s.
mean(NoDNR$hospdead)
mean(YesDNR$hospdead)

The logic is that for a Boolean vector x, mean(x) is precisely the proportion of 1's, because it's the sum of the entries (equivalently, the count of the 1's) divided by how many there are.

Note that if the outcome column contains NA entries, you'll want to remove them (otherwise mean() returns NA):

# Proportion of in-hospital deaths (hospdead) per group, ignoring missing
# outcomes. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
mean(NoDNR$hospdead, na.rm = TRUE)
mean(YesDNR$hospdead, na.rm = TRUE)

For an alternate approach that's a bit cleaner and doesn't require creating interim subset data frames (NoDNR, YesDNR), you might consider a dplyr approach:

library(dplyr)

# Group on the day-3 DNR flag and report the proportion of in-hospital deaths
# (hospdead, the outcome the question asks about) in each group. Both output
# columns are named so the printed result is self-describing.
SB_xlsx1 %>%
  group_by(dnr_by_day3 = dnrday <= 3) %>%
  summarize(prop_hospdead = mean(hospdead))

CodePudding user response:

There are a few ways to do this, but the simplest is probably via the aggregate function.

# Mean of hospdead (= proportion of in-hospital deaths) for each value of the
# logical grouping dnrday <= 3. Printed output is shown as comments so the
# snippet can be pasted and run as is.
aggregate(hospdead ~ (dnrday <= 3), data = SB_xlsx1, FUN = mean)
#   dnrday <= 3  hospdead
# 1       FALSE 0.1428571
# 2        TRUE 0.0000000

CodePudding user response:

You may use tapply to group deaths by the condition dnrday <= 3, i.e. with an active DNR on day 3 and calculate the mean.

# Within-group proportion of in-hospital deaths: the mean of the 0/1 hospdead
# column, computed separately for dnrday > 3 (FALSE) and dnrday <= 3 (TRUE).
# Rows with a missing dnrday fall into neither group and are left out.
(res <- with(SB_xlsx, tapply(hospdead, dnrday <= 3, mean)))
#     FALSE      TRUE 
# 0.1428571 0.0000000 

# Each value is a proportion within its own group, so the two values are not
# expected to sum to 1.
  • Related