I have the following data:

ID <- c("A", "B", "C", "D", "E", "F")
age <- c(54, 61, 65, 55, 60, 60)
sex <- c(0, 0, 1, 1, 1, 0)
Q1 <- c(0, 0, 0, 0, 0, 0)
Q2 <- c(0, 1, 0, 0, 0, 1) 
Q3 <- c(0, 1, 1, 0, 0, 1) 
Q4 <- c(0, 1, 1, 1, 0, 1) 
Q5 <- c(0, 1, 1, 1, 0, 1)
E1 <- c(2, 1, 0, 0, 0, 0)
E2 <- c(0, 1, 2, 0, 1, 0)
E3 <- c(0, 0, 1, 0, 1, 1)
E4 <- c(1, 0, 0, 0, 0, 0)
E5 <- c(0, 0, 0, 0, 2, 2)
Sint <- c(4, 3, 4, 1, 0, 2)
surv1 <- c(1, 1, 1, 1, 1, 1)
surv2 <- c(1, 1, 0, 1, 1, 1)
surv3 <- c(1, 1, 0, 1, 1, 1)
surv4 <- c(1, 1, 0, 1, 1, 0)
surv5 <- c(1, 1, 0, 1, 0, 0)
surv6 <- c(1, 1, 0, 1, 0, 0)


dta <- data.frame(ID, age, sex, Q1, Q2, Q3, Q4, Q5, E1, E2, E3, E4, E5, Sint,
                  surv1, surv2, surv3, surv4, surv5, surv6)

I created the following arrays:

surv_wave <- c("surv1", "surv2", "surv3", "surv4", "surv5", "surv6")
var_num <- c("age", "sex")
Wave2 <- c("age", "sex", "Q1", "E1", "Sint")
Wave3 <- c("age", "sex", "Q1", "Q2", "E1", "E2", "Sint")
Wave4 <- c("age", "sex", "Q1", "Q2", "Q3", "E1", "E2", "E3", "Sint")
Wave5 <- c("age", "sex", "Q1", "Q2", "Q3", "Q4", "E1", "E2", "E3", "E4", "Sint")
Wave6 <- c("age", "sex", "Q1", "Q2", "Q3", "Q4", "Q5", "E1", "E2", "E3", "E4", "E5", "Sint")
Waves <- c("Wave2", "Wave3", "Wave4", "Wave5", "Wave6")

And I want to iterate over the arrays to predict probabilities of survival given the variables in the arrays:

# Probability variables that will be predicted
dta$wsd2 <- NA
dta$wsd3 <- NA
dta$wsd4 <- NA
dta$wsd5 <- NA
dta$wsd6 <- NA

# vector of variables that will be predicted
wsurv_den <- c("wsd2", "wsd3", "wsd4", "wsd5", "wsd6")

# iterate all waves
for(i in 2:6) {
  
  # subset people who survived in the previous wave
  Subset <- subset(dta, dta[[surv_wave[i-1]]] == 1)
  
  # logistic regression
  f <- as.formula(
    paste(surv_wave[i], 
          paste(Waves[i], collapse = "   "),
          sep = " ~ "))
  Den_surv_s <- glm(f,  family = binomial(link = "logit"),  
                    data = Subset)
  
  # predict probabilities of survival based on logistic regression
  Den_surv_p_s <- predict(Den_surv_s, type = "response")
  
  # Add predicted values to original dataset
  dta[dta[[surv_wave[i-1]]] == 1,][[wsurv_den[i-1]]]<-Den_surv_p_s
  
}

I keep getting an error message: Error in model.frame.default(formula = f, data = Subset, drop.unused.levels = TRUE) : variable lengths differ (found for 'Wave3')

I looked at possible solutions, but I don't have NA values and the only "Wave3" variable I have in the environment is the array. What am I doing wrong?

CodePudding user response：

You are receiving this error because of the formulas you are passing to glm(). While you are likely expecting paste(Waves[i], collapse = " ") to construct a string pasting together each of the values in the vector with the name returned from Waves[i], your code is pasting the name of the vector itself.

You can fix this by passing Waves[i] to get() first, which will pass the object itself to paste(), rather than the name of the object.: paste(Waves[i], collapse = " ")

What your code does when i = 2 in the loop

f <- as.formula(
  paste(surv_wave[2], 
        paste(Waves[2], collapse = "   "),
        sep = " ~ "))

glm(f,  family = binomial(link = "logit"),  
                  data = dta)
#> Error in model.frame.default(formula = f, data = dta, drop.unused.levels = TRUE): variable 
#> lengths differ (found for 'Wave3')

# You're passing the following formula to glm():
f
#> surv2 ~ Wave3

Passing to get() first passes the formula you want

f <- as.formula(
  paste(surv_wave[2], 
        paste(get(Waves[2]), collapse = "   "),
        sep = " ~ "))

# Here's what you are now passing:
f
#> surv2 ~ age   sex   Q1   Q2   E1   E2   Sint


glm(f,  family = binomial(link = "logit"),  
                  data = dta)
#> 
#> Call:  glm(formula = f, family = binomial(link = "logit"), data = dta)
#> 
#> Coefficients:
#> (Intercept)          age          sex           Q1           Q2           E1  
#>     146.414       -1.965       -3.931           NA       15.722       11.792  
#>          E2         Sint  
#>          NA       -9.826  
#> 
#> Degrees of Freedom: 5 Total (i.e. Null);  0 Residual
#> Null Deviance:       5.407 
#> Residual Deviance: 2.572e-10     AIC: 12

CodePudding user response：

First of all, please use reformulate instead of constipated, nested paste instructions to assemble a formula. reformulate does it in one simple step.

In the code below, Waves is the list in @socialscientist's answer and at end.

for(i in seq_along(surv_wave)[-1]) {
  surv_prev <- dta[[ surv_wave[i - 1L] ]]
  i_surv <- which(surv_prev == 1L)
  srv <- surv_wave[i]
  wv <- Waves[[i - 1L]]
  Subset <- dta[i_surv, ]
  f <- reformulate(wv, srv)
  fit <- glm(f, data = Subset, family = binomial(link = "logit"))
  ypred <- predict(fit, type = "response")
  dta[i_surv, wsurv_den[i - 1L]] <- ypred
}

Data

Waves list.

Waves = list(
  Wave2 =c("age", "sex", "Q1", "E1", "Sint"),
  Wave3 =c("age", "sex", "Q1", "Q2", "E1", "E2", "Sint"),
  Wave4 =c("age", "sex", "Q1", "Q2", "Q3", "E1", "E2", "E3", "Sint"),
  Wave5 =c("age", "sex", "Q1", "Q2", "Q3", "Q4", "E1", "E2", "E3", "E4", "Sint"),
  Wave6 =c("age", "sex", "Q1", "Q2", "Q3", "Q4", "Q5", "E1", "E2", "E3", "E4", "E5", "Sint")
)

CodePudding user response：

I think a couple of changes will help you.

First, create a named list Waves, where each element is a vector your variables for that wave, like this:

Waves = list(
  Wave2 =c("age", "sex", "Q1", "E1", "Sint"),
  Wave3 =c("age", "sex", "Q1", "Q2", "E1", "E2", "Sint"),
  Wave4 =c("age", "sex", "Q1", "Q2", "Q3", "E1", "E2", "E3", "Sint"),
  Wave5 =c("age", "sex", "Q1", "Q2", "Q3", "Q4", "E1", "E2", "E3", "E4", "Sint"),
  Wave6 =c("age", "sex", "Q1", "Q2", "Q3", "Q4", "Q5", "E1", "E2", "E3", "E4", "E5", "Sint")
)

Then, update the creation of the formula to reference this list

  f <- as.formula(
    paste(surv_wave[i], "~", paste(Waves[[paste0("Wave",i)]], collapse="   "))
  )