How to translate values in a column to "yes" and "no" values for a multiple regression - CodePudding

I am doing a multiple linear regression with the following reproducible dataset (this is a small sample of my data):

# Reproducible 10-row sample of the data, written as a `structure()` call
# (this looks like dput() output). Evaluating the expression rebuilds a
# data.frame; assign the result to a name to work with it -- the code further
# down refers to the data as `SB_xlsx13`.
# NOTE(review): the sample has no `dzclass_f` column, which the first model
# uses; presumably it is a factor version of `dzclass` -- confirm.
structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498, 
 79.88495, 93.01599, 62.37097, 86.83899, 85.65594, 42.25897), 
     death = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), sex = c("male", 
     "female", "female", "female", "female", "male", "male", "male", 
     "male", "female"), hospdead = c(0, 1, 0, 0, 0, 1, 0, 0, 0, 
     0), slos = c(5, 4, 17, 3, 16, 4, 9, 7, 12, 8), d.time = c(2029, 
     4, 47, 133, 2029, 4, 659, 142, 63, 370), dzgroup = c("Lung Cancer", 
     "Cirrhosis", "Cirrhosis", "Lung Cancer", "ARF/MOSF w/Sepsis", 
     "Coma", "CHF", "CHF", "Lung Cancer", "Colon Cancer"), dzclass = c("Cancer", 
     "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", "ARF/MOSF", 
     "Coma", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", 
     "Cancer"), num.co = c(0, 2, 2, 2, 1, 1, 1, 3, 2, 0), edu = c(11, 
     12, 12, 11, NA, 14, 14, NA, 12, 11), income = c("$11-$25k", 
     "$11-$25k", "under $11k", "under $11k", NA, NA, "$25-$50k", 
     NA, NA, "$25-$50k"), scoma = c(0, 44, 0, 0, 26, 55, 0, 26, 
     26, 0), charges = c(9715, 34496, 41094, 3075, 50127, 6884, 
     30460, 30460, NA, 9914), totcst = c(NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_), totmcst = c(NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
     ), avtisst = c(7, 29, 13, 7, 18.666656, 5, 8, 6.5, 8.5, 8
     ), race = c("other", "white", "white", "white", "white", 
     "white", "white", "white", "black", "hispanic"), sps = c(33.8984375, 
     52.6953125, 20.5, 20.0976562, 23.5, 19.3984375, 17.296875, 
     21.5976562, 15.8984375, 2.2998047), aps = c(20, 74, 45, 19, 
     30, 27, 46, 53, 17, 9), surv2m = c(0.262939453, 0.0009999275, 
     0.790893555, 0.698974609, 0.634887695, 0.284973145, 0.892944336, 
     0.670898438, 0.570922852, 0.952880859), surv6m = c(0.0369949341, 
     0, 0.664916992, 0.411987305, 0.532958984, 0.214996338, 0.820922852, 
     0.498962402, 0.24899292, 0.887939453), hday = c(1, 3, 4, 
     1, 3, 1, 1, 1, 1, 1), diabetes = c(0, 0, 0, 0, 0, 0, 0, 1, 
     0, 0), dementia = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), ca = c("metastatic", 
     "no", "no", "metastatic", "no", "no", "no", "no", "metastatic", 
     "metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619, 0.899999619, 
     0, NA, 0.799999714, 0.049999982, NA), prg6m = c(0.25, 0, 
     0.5, 0.5, 0.8999996, 0, 0.6999998, 0.3999999, 0.0001249999, 
     NA), dnr = c("no dnr", NA, "no dnr", "no dnr", "no dnr", 
     "no dnr", "no dnr", "no dnr", "dnr after sadm", "no dnr"), 
     dnrday = c(5, NA, 17, 3, 16, 4, 9, 7, 2, 8), meanbp = c(97, 
     43, 70, 75, 59, 110, 78, 72, 97, 84), wblc = c(6, 17.0976562, 
     8.5, 9.09960938, 13.5, 10.3984375, 11.6992188, 13.5996094, 
     9.69921875, 11.2988281), hrt = c(69, 112, 88, 88, 112, 101, 
     120, 100, 56, 94), resp = c(22, 34, 28, 32, 20, 44, 28, 26, 
     20, 20), temp = c(36, 34.59375, 37.39844, 35, 37.89844, 38.39844, 
     37.39844, 37.59375, 36.59375, 38.19531), pafi = c(388, 98, 
     231.65625, NA, 173.3125, 266.625, 309.5, 404.75, 357.125, 
     NA), alb = c(1.7998047, NA, NA, NA, NA, NA, 4.7998047, NA, 
     NA, 4.6992188), bili = c(0.19998169, NA, 2.19970703, NA, 
     NA, NA, 0.39996338, NA, 0.39996338, 0.19998169), crea = c(1.19995117, 
     5.5, 2, 0.79992676, 0.79992676, 0.69995117, 1.59985352, 2, 
     1, 0.79992676), sod = c(141, 132, 134, 139, 143, 140, 132, 
     139, 143, 139), ph = c(7.459961, 7.25, 7.459961, NA, 7.509766, 
     7.65918, 7.479492, 7.509766, 7.449219, NA), glucose = c(NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_), bun = c(NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_), urine = c(NA_real_, NA_real_, NA_real_, 
     NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
     NA_real_), adlp = c(7, NA, 1, 0, NA, NA, 0, NA, NA, 0), adls = c(7, 
     1, 0, 0, 2, 1, 1, 0, 7, NA), sfdm2 = c(NA, "<2 mo. follow-up", 
     "<2 mo. follow-up", "no(M2 and SIP pres)", "no(M2 and SIP pres)", 
     "<2 mo. follow-up", "no(M2 and SIP pres)", NA, NA, NA), adlsc = c(7, 
     1, 0, 0, 2, 1, 1, 0, 7, 0.4947999)), row.names = c(NA, 10L
 ), class = "data.frame")

I have my formula for the regression here.

# Keep only the rows where both `dnrday` and `sps` are observed.
SB_xlsx13 <- SB_xlsx13[!is.na(SB_xlsx13$dnrday), ]
SB_xlsx13 <- SB_xlsx13[!is.na(SB_xlsx13$sps), ]

# Regress `hospdead` on `dzclass_f`, `age`, `sex`, `num.co` and `sps`.
# The predictors must be joined with `+`; without the operators the formula
# does not parse. The `SB_xlsx13$` prefixes are kept on purpose so that the
# coefficient names match the summary printed below.
MLR_2 <- lm(
  SB_xlsx13$hospdead ~ SB_xlsx13$dzclass_f + SB_xlsx13$age +
    SB_xlsx13$sex + SB_xlsx13$num.co + SB_xlsx13$sps
)
summary(MLR_2)
## 
## Call:
## lm(formula = SB_xlsx13$hospdead ~ SB_xlsx13$dzclass_f + SB_xlsx13$age + 
##     SB_xlsx13$sex + SB_xlsx13$num.co + SB_xlsx13$sps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26132 -0.25758 -0.08914  0.15412  1.14048 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                           -0.3519553  0.0224017 -15.711  < 2e-16
## SB_xlsx13$dzclass_fCancer             -0.0870012  0.0123327  -7.055 1.86e-12
## SB_xlsx13$dzclass_fComa                0.2907825  0.0164644  17.661  < 2e-16
## SB_xlsx13$dzclass_fCOPD/CHF/Cirrhosis -0.1378731  0.0104787 -13.157  < 2e-16
## SB_xlsx13$age                          0.0027082  0.0002555  10.598  < 2e-16
## SB_xlsx13$sexmale                      0.0022789  0.0079126   0.288    0.773
## SB_xlsx13$num.co                       0.0028155  0.0032577   0.864    0.387
## SB_xlsx13$sps                          0.0184986  0.0004393  42.105  < 2e-16
##                                          
## (Intercept)                           ***
## SB_xlsx13$dzclass_fCancer             ***
## SB_xlsx13$dzclass_fComa               ***
## SB_xlsx13$dzclass_fCOPD/CHF/Cirrhosis ***
## SB_xlsx13$age                         ***
## SB_xlsx13$sexmale                        
## SB_xlsx13$num.co                         
## SB_xlsx13$sps                         ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3724 on 9067 degrees of freedom
## Multiple R-squared:  0.2772, Adjusted R-squared:  0.2767 
## F-statistic: 496.8 on 7 and 9067 DF,  p-value: < 2.2e-16

The regression comes out just fine; however, I want to add one more variable: DNR status at day three. If the value of dnrday is 3 or less, there is a DNR, and if the value is over 3, there is not a DNR. For an earlier task I subsetted these values using this code:

# Split the rows of `SB_xlsx12` by `dnrday`: 3 or less versus more than 3.
# `subset()` has no `na.rm` argument (it would be silently ignored), and it
# already drops rows whose condition evaluates to NA, so none is passed.
YesDNR <- subset(SB_xlsx12, dnrday <= 3)
NoDNR <- subset(SB_xlsx12, dnrday > 3)

This worked fine, but I can't really use these subsets in my regression model. I assume that, to make the model work, I would need to translate every value of 3 or less (<=) in the "dnrday" column to "yes" and every value over 3 (>) to "no". Am I correct in this thinking, and if so, how would I go about changing those values?

CodePudding user response：

I would create a new column - see two options below.

(NB in lm() you don't have to specify SB_xlsx13$ each time you add a covariate if you list it as the data = argument once! This will make your output easier to read.)

Tidyverse approach: mutate and case_when:

library(dplyr)

# Derive `dnr_d3` from `dnrday`: "yes" when dnrday <= 3, "no" when dnrday > 3.
# When `dnrday` is NA both comparisons are NA, so the fallback branch applies
# and `dnr_d3` is NA as well.
SB_xlsx13 <- SB_xlsx13 %>%
  mutate(
    dnr_d3 = case_when(
      dnrday <= 3 ~ "yes",
      dnrday > 3 ~ "no",
      TRUE ~ NA_character_
    )
  )

# The covariates are joined with `+`; passing `data =` lets the formula use
# bare column names.
MLR_3 <- lm(
  hospdead ~ dzclass + age + sex + num.co + sps + dnr_d3,
  data = SB_xlsx13
)

Base R approach:

# Fill `dnr_d3` in two passes: "yes" where dnrday <= 3, "no" where dnrday > 3.
# Rows where `dnrday` is NA are left untouched by both assignments.
SB_xlsx13$dnr_d3[SB_xlsx13$dnrday <= 3] <- "yes"
SB_xlsx13$dnr_d3[SB_xlsx13$dnrday > 3] <- "no"

# The covariates are joined with `+`; passing `data =` lets the formula use
# bare column names.
MLR_4 <- lm(
  hospdead ~ dzclass + age + sex + num.co + sps + dnr_d3,
  data = SB_xlsx13
)

CodePudding user response：

You may simply use the AsIs function I() in the formula. Also use factor to easily factorize variables.

# Build the derived terms inside the formula itself: `factor(dzclass)` turns
# the class into a factor and `I(dnrday <= 3)` adds the logical comparison as
# a predictor. The terms are joined with `+`; without the operators the
# formula does not parse.
lm(
  hospdead ~ factor(dzclass) + I(dnrday <= 3) + age + sex + num.co + sps,
  data = SB_xlsx13
)

This might look a bit unclean but it's nice for playing around with the data. Once you are happy with something you may easily change it in the data using transform. For instance, the DNR variable can be created boolean just using dnrday <= 3.

# Store the derived variables in the data itself:
#   dzclass_f - `dzclass` converted with as.factor()
#   dnrday_3  - logical, TRUE where dnrday <= 3
SB_xlsx13 <- transform(
  SB_xlsx13,
  dzclass_f = as.factor(dzclass),
  dnrday_3 = dnrday <= 3
)