I have the following data:
# Toy data set: six people (A-F), one row each.
ID <- LETTERS[1:6]

# Baseline covariates
age <- c(54, 61, 65, 55, 60, 60)
sex <- c(0, 0, 1, 1, 1, 0)

# Q1-Q5 items (Q1 is 0 for everyone)
Q1 <- rep(0, 6)
Q2 <- c(0, 1, 0, 0, 0, 1)
Q3 <- c(0, 1, 1, 0, 0, 1)
Q4 <- c(0, 1, 1, 1, 0, 1)
Q5 <- c(0, 1, 1, 1, 0, 1)

# E1-E5 items
E1 <- c(2, 1, 0, 0, 0, 0)
E2 <- c(0, 1, 2, 0, 1, 0)
E3 <- c(0, 0, 1, 0, 1, 1)
E4 <- c(1, 0, 0, 0, 0, 0)
E5 <- c(0, 0, 0, 0, 2, 2)

Sint <- c(4, 3, 4, 1, 0, 2)

# Survival indicators per wave (1 = survived); everyone has 1 in wave 1
surv1 <- rep(1, 6)
surv2 <- c(1, 1, 0, 1, 1, 1)
surv3 <- c(1, 1, 0, 1, 1, 1)
surv4 <- c(1, 1, 0, 1, 1, 0)
surv5 <- c(1, 1, 0, 1, 0, 0)
surv6 <- c(1, 1, 0, 1, 0, 0)

# Assemble everything into one data frame; column names equal the vector names.
dta <- data.frame(
  ID = ID, age = age, sex = sex,
  Q1 = Q1, Q2 = Q2, Q3 = Q3, Q4 = Q4, Q5 = Q5,
  E1 = E1, E2 = E2, E3 = E3, E4 = E4, E5 = E5,
  Sint = Sint,
  surv1 = surv1, surv2 = surv2, surv3 = surv3,
  surv4 = surv4, surv5 = surv5, surv6 = surv6
)
I created the following arrays:
# Names of the survival indicator columns, waves 1 to 6.
surv_wave <- paste0("surv", 1:6)
# Covariates shared by every wave.
var_num <- c("age", "sex")
# Predictor names per wave: wave k uses Q1..Q(k-1) and E1..E(k-1), plus Sint.
Wave2 <- c(var_num, "Q1", "E1", "Sint")
Wave3 <- c(var_num, paste0("Q", 1:2), paste0("E", 1:2), "Sint")
Wave4 <- c(var_num, paste0("Q", 1:3), paste0("E", 1:3), "Sint")
Wave5 <- c(var_num, paste0("Q", 1:4), paste0("E", 1:4), "Sint")
Wave6 <- c(var_num, paste0("Q", 1:5), paste0("E", 1:5), "Sint")
# Character vector holding the *names* of the per-wave vectors above.
Waves <- paste0("Wave", 2:6)
And I want to iterate over the arrays to predict probabilities of survival given the variables in the arrays:
# Probability variables that will be predicted
# (one column per wave 2..6, initialised to NA and filled in by the loop below)
dta$wsd2 <- NA
dta$wsd3 <- NA
dta$wsd4 <- NA
dta$wsd5 <- NA
dta$wsd6 <- NA
# vector of variables that will be predicted
# (names of the columns created above, in wave order)
wsurv_den <- c("wsd2", "wsd3", "wsd4", "wsd5", "wsd6")
# iterate all waves
for(i in 2:6) {
# subset people who survived in the previous wave
Subset <- subset(dta, dta[[surv_wave[i-1]]] == 1)
# logistic regression
# NOTE(review): Waves[i] is a single string such as "Wave3" (the *name* of a
# vector), so the formula built here is e.g. surv2 ~ Wave3, not a formula on
# the variables listed in that vector. Waves also has only five elements while
# i runs up to 6, so Waves[6] is NA.
f <- as.formula(
paste(surv_wave[i],
paste(Waves[i], collapse = " "),
sep = " ~ "))
Den_surv_s <- glm(f, family = binomial(link = "logit"),
data = Subset)
# predict probabilities of survival based on logistic regression
# (no newdata is given, so these are the fitted values for the rows of Subset)
Den_surv_p_s <- predict(Den_surv_s, type = "response")
# Add predicted values to original dataset
# (only the rows with a 1 in the previous wave's survival column are targeted)
dta[dta[[surv_wave[i-1]]] == 1,][[wsurv_den[i-1]]]<-Den_surv_p_s
}
I keep getting an error message: Error in model.frame.default(formula = f, data = Subset, drop.unused.levels = TRUE) : variable lengths differ (found for 'Wave3')
I looked at possible solutions, but I don't have NA values and the only "Wave3" variable I have in the environment is the array. What am I doing wrong?
CodePudding user response:
You are receiving this error because of the formula you are passing to glm(). You are likely expecting paste(Waves[i], collapse = " ") to paste together the values stored in the vector whose name is returned by Waves[i]; instead, your code pastes the name of that vector itself (for example "Wave3").
You can fix this by passing Waves[i] to get() first, i.e. get(Waves[i]), which hands the object itself to paste() rather than the name of the object.
What your code does when i = 2 in the loop
# Reproduce the formula construction from the loop body with the index fixed
# at 2. Waves[2] is the single string "Wave3", so that string becomes the
# whole right-hand side of the formula.
f <- as.formula(
paste(surv_wave[2],
paste(Waves[2], collapse = " "),
sep = " ~ "))
# Fitting with this formula raises the error reported in the question.
glm(f, family = binomial(link = "logit"),
data = dta)
#> Error in model.frame.default(formula = f, data = dta, drop.unused.levels = TRUE): variable
#> lengths differ (found for 'Wave3')
# You're passing the following formula to glm():
f
#> surv2 ~ Wave3
Passing Waves[2] through get() first produces the formula you want:
# Build the model formula from the variable names stored in the vector that
# Waves[2] names. get() looks the object up by its name, and the names are
# joined with " + " so the right-hand side parses as a sum of model terms
# (joining with a plain space gives a string as.formula() cannot parse).
f <- as.formula(
  paste(surv_wave[2],
        paste(get(Waves[2]), collapse = " + "),
        sep = " ~ "))
# Here's what you are now passing:
f
#> surv2 ~ age + sex + Q1 + Q2 + E1 + E2 + Sint
# The model now fits; NA coefficients mark terms glm() could not estimate.
glm(f, family = binomial(link = "logit"),
    data = dta)
#>
#> Call: glm(formula = f, family = binomial(link = "logit"), data = dta)
#>
#> Coefficients:
#> (Intercept) age sex Q1 Q2 E1
#> 146.414 -1.965 -3.931 NA 15.722 11.792
#> E2 Sint
#> NA -9.826
#>
#> Degrees of Freedom: 5 Total (i.e. Null); 0 Residual
#> Null Deviance: 5.407
#> Residual Deviance: 2.572e-10 AIC: 12
CodePudding user response:
First of all, please use reformulate instead of complicated, nested paste instructions to assemble a formula. reformulate does it in one simple step.
In the code below, Waves is the named list from @socialscientist's answer; it is reproduced at the end of this answer.
# For every wave after the first: fit a logistic regression of that wave's
# survival indicator on the wave's predictor set, using only the people with
# a 1 in the previous wave's survival column, and store the fitted
# probabilities in the matching wsd* column of dta.
for (wave in seq_along(surv_wave)[-1]) {
  prev <- wave - 1L
  # row positions of people who survived the previous wave
  alive_rows <- which(dta[[surv_wave[prev]]] == 1L)
  # response is this wave's survival column; terms come from the Waves list
  model_formula <- reformulate(Waves[[prev]], surv_wave[wave])
  model <- glm(
    model_formula,
    data = dta[alive_rows, ],
    family = binomial(link = "logit")
  )
  # write the fitted probabilities back to exactly the rows that were modelled
  dta[alive_rows, wsurv_den[prev]] <- predict(model, type = "response")
}
Data: the Waves list.
# Named list with one character vector of predictor names per wave.
# Element "Wave<k>" holds age, sex, Q1..Q(k-1), E1..E(k-1) and Sint.
Waves <- lapply(
  setNames(1:5, paste0("Wave", 2:6)),
  function(k) {
    c("age", "sex", paste0("Q", seq_len(k)), paste0("E", seq_len(k)), "Sint")
  }
)
CodePudding user response:
I think a couple of changes will help you.
- First, create a named list Waves, where each element is a vector of your variables for that wave, like this:
# Named list of predictor names, filled in one wave at a time so that each
# element can be looked up by its wave name (e.g. Waves[["Wave3"]]).
Waves <- list()
Waves[["Wave2"]] <- c("age", "sex", "Q1", "E1", "Sint")
Waves[["Wave3"]] <- c("age", "sex", "Q1", "Q2", "E1", "E2", "Sint")
Waves[["Wave4"]] <- c("age", "sex", "Q1", "Q2", "Q3", "E1", "E2", "E3", "Sint")
Waves[["Wave5"]] <- c(
  "age", "sex", "Q1", "Q2", "Q3", "Q4", "E1", "E2", "E3", "E4", "Sint"
)
Waves[["Wave6"]] <- c(
  "age", "sex", "Q1", "Q2", "Q3", "Q4", "Q5",
  "E1", "E2", "E3", "E4", "E5", "Sint"
)
- Then, update the creation of the formula to reference this list
# Look up this wave's predictor names in the named list (element "Wave<i>")
# and join them with " + " so the right-hand side parses as a sum of model
# terms; joining with a plain space would make as.formula() fail to parse.
f <- as.formula(
  paste(surv_wave[i], "~", paste(Waves[[paste0("Wave", i)]], collapse = " + "))
)