Using information from two different data frames (regression lm function)-CodePudding

Currently I have two data frames that look something like this:

# Each row of df1 is one regression pair: V1 and V2 hold column names of df2.
df1 <- data.frame(
  V1 = c("Name1", "Name2", "Name4", "Name4"),
  V2 = c("Name2", "Name3", "Name5", "Name3")
)

# df2 holds the observations, one numeric column per name used in df1.
df2 <- data.frame(
  Name1 = c(153, 157, 167, 163, 165),
  Name2 = c(132, 127, 130, 132, 134),
  Name3 = c(72, 83, 85, 90, 86),
  Name4 = c(240, 238, 245, 247, 250),
  Name5 = c(121, 125, 130, 128, 132)
)

df1 contains the pairs of variables I want to regress on each other, while df2 contains the values to be used in the regressions. So I am trying to do something like this:

# Goal: for each row of df1, regress the df2 column named in df1[i, 1]
# on the df2 column named in df1[i, 2], and collect the two coefficients.
output_all = NULL
for (i in 1:nrow(df1)) {
  # NOTE(review): print() passes the name on as a character string, so lm()
  # presumably sees strings rather than the df2 columns -- likely the cause
  # of the error reported below.
  output <- lm(print(df1[i,1], quote = FALSE) ~ print(df1[i,2], quote = FALSE), data = df2)
  # NOTE(review): `leastsqr_output` is not defined anywhere in this snippet;
  # presumably `output` (or its coefficients) was intended -- confirm.
  b_0 <- leastsqr_output[[1]]
  b_1 <- leastsqr_output[[2]]
  # output_all is reassigned on every pass, so only the last pair would remain.
  output_all <- cbind(b_0, b_1)
}

This produces the following error:

Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
  contrasts can be applied only to factors with 2 or more levels
In addition: Warning message:
In storage.mode(v) <- "double" : NAs introduced by coercion

Now, I believe my issue lies in the lm function, as I get the same error from:

# Same lm() call as in the loop, fixed to the first row of df1.
output <- lm(print(df1[1,1], quote = FALSE) ~ print(df1[1,2], quote = FALSE), data = df2)

I have tried changing the print with other functions such as cat, as I believe the issue may be in how the name is printed.

Is there a way to make this work?

CodePudding user response：

Use an apply loop and reformulate instead of a for loop.
After the regressions, there is code to get the summaries and the coefficients.

# Regression pairs: V1 names the response column of df2, V2 the predictor.
df1 <- data.frame(
  V1 = c("Name1", "Name2", "Name4", "Name4"),
  V2 = c("Name2", "Name3", "Name5", "Name3")
)

# Observations, one numeric column per variable name used in df1.
df2 <- data.frame(
  Name1 = c(153, 157, 167, 163, 165),
  Name2 = c(132, 127, 130, 132, 134),
  Name3 = c(72, 83, 85, 90, 86),
  Name4 = c(240, 238, 245, 247, 250),
  Name5 = c(121, 125, 130, 128, 132)
)

# Fit one model per row of df1. reformulate() builds the formula
# `<x[1]> ~ <x[2]>` from the two column names held in that row.
output_all <- apply(
  df1,
  MARGIN = 1,
  FUN = function(x, data) {
    fmla <- reformulate(termlabels = x[2], response = x[1])
    lm(fmla, data = data)
  },
  data = df2
)

# Per-model coefficient vectors and full summaries, in the row order of df1.
coef_list <- lapply(X = output_all, FUN = coef)
smry_list <- lapply(X = output_all, FUN = summary)

output_all[[1]]
#> 
#> Call:
#> lm(formula = fmla, data = data)
#> 
#> Coefficients:
#> (Intercept)        Name2  
#>     86.1429       0.5714

smry_list[[1]]
#> 
#> Call:
#> lm(formula = fmla, data = data)
#> 
#> Residuals:
#>      1      2      3      4      5 
#> -8.571 -1.714  6.571  1.429  2.286 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  86.1429   161.0126   0.535    0.630
#> Name2         0.5714     1.2289   0.465    0.674
#> 
#> Residual standard error: 6.503 on 3 degrees of freedom
#> Multiple R-squared:  0.06723,    Adjusted R-squared:  -0.2437 
#> F-statistic: 0.2162 on 1 and 3 DF,  p-value: 0.6736

coef_list
#> [[1]]
#> (Intercept)       Name2 
#>  86.1428571   0.5714286 
#> 
#> [[2]]
#>  (Intercept)        Name3 
#> 129.63457330   0.01641138 
#> 
#> [[3]]
#> (Intercept)       Name5 
#> 121.5614973   0.9625668 
#> 
#> [[4]]
#> (Intercept)       Name3 
#> 205.3129103   0.4649891

^{Created on 2022-05-07 by the reprex package (v2.0.1)}

Edit

In order to run the regressions with a for loop, create an empty list with as many members as there are regressions to run, then use code equivalent to the apply code above, with reformulate.
The results are nearly identical; they differ only in the name of the data set recorded in each model's call.

# Preallocate one slot per row of df1, then fit the models in row order.
output_all2 <- vector(mode = "list", length = nrow(df1))
for (k in seq_len(nrow(df1))) {
  # Column 1 of the row names the response, column 2 the predictor.
  fmla <- reformulate(termlabels = df1[k, 2], response = df1[k, 1])
  output_all2[[k]] <- lm(fmla, data = df2)
}

all.equal(output_all, output_all2)
#> [1] "Component 1: Component 10: target, current do not match when deparsed"
#> [2] "Component 2: Component 10: target, current do not match when deparsed"
#> [3] "Component 3: Component 10: target, current do not match when deparsed"
#> [4] "Component 4: Component 10: target, current do not match when deparsed"

output_all[[1]][[10]]
#> lm(formula = fmla, data = data)
output_all2[[1]][[10]]
#> lm(formula = fmla, data = df2)

^{Created on 2022-05-07 by the reprex package (v2.0.1)}

CodePudding user response：

I have tried changing the print with other functions such as cat, as I believe the issue may be in how the name is printed.

I think you are looking for as.formula combined with paste. Here I try to follow your steps as closely as possible, with some modifications:

# Fit one simple linear regression per row of df1 and collect the estimates.
# Each row of df1 holds two column names of df2: the response in column 1
# and the predictor in column 2.
output <- vector("list", nrow(df1))
b_0 <- numeric(nrow(df1))
b_1 <- numeric(nrow(df1))
# seq_len() rather than 1:nrow(df1), so an empty df1 skips the loop instead
# of iterating over c(1, 0).
for (i in seq_len(nrow(df1))) {
  # paste() builds the text "<response> ~ <predictor>"; as.formula() turns it
  # into a formula whose variables lm() looks up in df2.
  output[[i]] <- lm(as.formula(paste(df1[i, 1], "~", df1[i, 2])), data = df2)
  estimates <- coef(output[[i]])
  b_0[i] <- estimates[1] # intercept
  b_1[i] <- estimates[2] # slope
}
# One row per regression, in the row order of df1.
output_all <- cbind(b_0, b_1)

The results:

output_all
#        b_0        b_1
# [1,]  86.14286 0.57142857
# [2,] 129.63457 0.01641138
# [3,] 121.56150 0.96256684
# [4,] 205.31291 0.46498906