How to perform multiple linear model fits of one column against all (pairwise) using dplyr and broom-CodePudding

Given the mtcars data:

> head(mtcars)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

What I want to do is to perform linear fit with y = mpg as the outcome with all other variables as predictors x , in one on one basis (i.e. from mpg ~ cyl up to mpg ~ carb).

How can I do that with dplyr?

So far I have this code (as suggested by Maurits):

 library(broom)
  library(tidyr)
  library(dplyr)
 lm(mpg ~ ., data = mtcars) %>% 
 # glance() # this fail
 tidy()

It gave me this result:

# A tibble: 11 × 5
   term        estimate std.error statistic p.value
   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)  12.3      18.7        0.657  0.518 
 2 cyl          -0.111     1.05      -0.107  0.916 
 3 disp          0.0133    0.0179     0.747  0.463 
 4 hp           -0.0215    0.0218    -0.987  0.335 
 5 drat          0.787     1.64       0.481  0.635 
 6 wt           -3.72      1.89      -1.96   0.0633
 7 qsec          0.821     0.731      1.12   0.274 
 8 vs            0.318     2.10       0.151  0.881 
 9 am            2.52      2.06       1.23   0.234 
10 gear          0.655     1.49       0.439  0.665 
11 carb         -0.199     0.829     -0.241  0.812

But when I do

> tidy(lm(mpg ~ cyl, data = mtcars))
# A tibble: 2 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)    37.9      2.07      18.3  8.37e-18
2 cyl            -2.88     0.322     -8.92 6.11e-10

Notice the difference in estimate from -2.88 to -0.11.

CodePudding user response：

You can use the following code

data(mtcars)

x = names(mtcars[,-1])
out <- unlist(lapply(1, function(n) combn(x, 1, FUN=function(row) paste0("mpg ~ ", paste0(row, collapse = " ")))))
out

#>  [1] "mpg ~ cyl"  "mpg ~ disp" "mpg ~ hp"   "mpg ~ drat" "mpg ~ wt"  
#>  [6] "mpg ~ qsec" "mpg ~ vs"   "mpg ~ am"   "mpg ~ gear" "mpg ~ carb"

library(broom)
library(dplyr)

#To have the regression coefficients
tmp1 = bind_rows(lapply(out, function(frml) {
  a = tidy(lm(frml, data=mtcars))
  a$frml = frml
  return(a)
}))
head(tmp1)

#To have the regression results i.e. R2, AIC, BIC
tmp2 = bind_rows(lapply(out, function(frml) {
  a = glance(lm(frml, data=mtcars))
  a$frml = frml
  return(a)
}))
head(tmp2)

write.csv(tmp1, "Try_lm_coefficients.csv")
write.csv(tmp2, "Try_lm_results.csv")

CodePudding user response：

Here is another option

library(tidyverse)
library(broom)
idx_var <- which(!names(mtcars) %in% "mpg") %>% set_names(names(mtcars)[.])
idx_res <- which(names(mtcars) %in% "mpg")

map_dfr(
    idx_var, 
    ~ tidy(lm(mtcars[[idx_res]] ~ mtcars[[.x]])),
    .id = "variable")
## A tibble: 20 x 6
#   variable term         estimate std.error statistic  p.value
#   <chr>    <chr>           <dbl>     <dbl>     <dbl>    <dbl>
# 1 cyl      (Intercept)   37.9      2.07       18.3   8.37e-18
# 2 cyl      mtcars[[.x]]  -2.88     0.322      -8.92  6.11e-10
# 3 disp     (Intercept)   29.6      1.23       24.1   3.58e-21
# 4 disp     mtcars[[.x]]  -0.0412   0.00471    -8.75  9.38e-10
# 5 hp       (Intercept)   30.1      1.63       18.4   6.64e-18
# 6 hp       mtcars[[.x]]  -0.0682   0.0101     -6.74  1.79e- 7
# 7 drat     (Intercept)   -7.52     5.48       -1.37  1.80e- 1
# 8 drat     mtcars[[.x]]   7.68     1.51        5.10  1.78e- 5
# 9 wt       (Intercept)   37.3      1.88       19.9   8.24e-19
#10 wt       mtcars[[.x]]  -5.34     0.559      -9.56  1.29e-10
#...