How to iterate over many dependent and independent variables to produce a list of regressions using-CodePudding

DATA BELOW

library(fixest)

analysis<-tibble(off_race = c("hispanic", "hispanic", "white","white", "hispanic", "white", "hispanic", "white", "white", "white","hispanic"), any_black_uof = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), any_black_arrest = c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), prop_white_scale = c(0.866619646027524, -1.14647499712298, 1.33793539994219, 0.593565300512359, -0.712819809606193, 0.3473585867755, -1.37025501425243, 1.16596624239715, 0.104521426674564, 0.104521426674564, -1.53728347122581),  prop_hisp_scale=c(-0.347382203637802, 1.54966785579018, 
    -0.833021026477168, -0.211470492567308, 1.48353691981021, 
    0.421968013870802, 2.63739845069911, -0.61002505397242, 0.66674880256898,0.66674880256898, 2.93190487813111))

I would like to run a series of regressions that iterate over these vectors

officer_race = c("black", "white", "hispanic")
primary_ind<-c("prop_white_scale","prop_hisp_scale","prop_black_scale")
outcome<-c("any_black_uof","any_white_uof","any_hisp_uof","any_black_arrest","any_white_arrest","any_hisp_arrest","any_black_stop","any_white_stop","any_hisp_stop")

Also of note, I would like to use the fixest package where the regressions would look like this

feols(any_black_uof~ prop_white_scale,data=analysis[analysis$off_race =="black"])
feols(any_black_uof~ prop_white_scale,data=analysis[analysis$off_race =="white"])
feols(any_black_uof~ prop_white_scale,data=analysis[analysis$off_race=="hispanic"])
feols(any_black_uof~ prop_hisp_scale,data=analysis[analysis$off_race =="black"])
feols(any_black_uof~ prop_hisp_scale,data=analysis[analysis$off_race =="white"])

etc. iterating through all possible combinations and creating a list of fixest regression objects.

I tried this and it works with the lm() function. Yet, it breaks with feols.



result <- lapply(split(analysis, analysis$off_race), function(x) {
  sapply(primary_ind2, function(y) {
    sapply(outcome2, function(z) {
      feols(paste(y, z, sep = "~"), x)
    }, simplify = FALSE)
  }, simplify = FALSE)
})

CodePudding user response：

The reason lm() works is because we can supply formulas as strings (probably because its such a common mistake to make?) so it works with paste().

Usually we use reformulate() to create formulas based on strings and with this feols() will work.

I changed your model input data a little because we only have a fraction of your data and your model inputs contain column names that are not present in the example data.

library(fixest)
library(tibble)

officer_race <- c("black", "white", "hispanic")
primary_ind <-c("prop_white_scale","prop_hisp_scale")
outcome <- c("any_black_uof","any_black_arrest")


result <- lapply(split(analysis, analysis$off_race), function(x) {
  sapply(primary_ind, function(y) {
    sapply(outcome, function(z) {
      feols(reformulate(y, z), x)
    }, simplify = FALSE)
  }, simplify = FALSE)
})
#> The variable 'any_black_uofTRUE' has been removed because of collinearity (see $collin.var).
#> The variable 'any_black_uofTRUE' has been removed because of collinearity (see $collin.var).
#> The variable 'any_black_uofTRUE' has been removed because of collinearity (see $collin.var).
#> The variable 'any_black_arrestTRUE' has been removed because of collinearity (see $collin.var).
#> The variable 'any_black_uofTRUE' has been removed because of collinearity (see $collin.var).
#> The variable 'any_black_arrestTRUE' has been removed because of collinearity (see $collin.var).

result
#> $hispanic
#> $hispanic$prop_white_scale
#> $hispanic$prop_white_scale$any_black_uof
#> OLS estimation, Dep. Var.: prop_white_scale
#> Observations: 5 
#> Standard-errors: IID 
#>              Estimate Std. Error  t value Pr(>|t|) 
#> (Intercept) -0.780043   0.434284 -1.79616  0.14689 
#> ... 1 variable was removed because of collinearity (any_black_uofTRUE)
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 0.868568            
#> 
#> $hispanic$prop_white_scale$any_black_arrest
#> OLS estimation, Dep. Var.: prop_white_scale
#> Observations: 5 
#> Standard-errors: IID 
#>                      Estimate Std. Error  t value  Pr(>|t|)    
#> (Intercept)          -1.19171   0.178578 -6.67332 0.0068613 ** 
#> any_black_arrestTRUE  2.05833   0.399313  5.15468 0.0141561 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 0.276652   Adj. R2: 0.864731
#> 
#> 
#> $hispanic$prop_hisp_scale
#> $hispanic$prop_hisp_scale$any_black_uof
#> OLS estimation, Dep. Var.: prop_hisp_scale
#> Observations: 5 
#> Standard-errors: IID 
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  1.65103   0.576435  2.8642 0.045735 *  
#> ... 1 variable was removed because of collinearity (any_black_uofTRUE)
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 1.15287            
#> 
#> $hispanic$prop_hisp_scale$any_black_arrest
#> OLS estimation, Dep. Var.: prop_hisp_scale
#> Observations: 5 
#> Standard-errors: IID 
#>                      Estimate Std. Error  t value Pr(>|t|)    
#> (Intercept)           2.15063   0.371203  5.79366 0.010230 *  
#> any_black_arrestTRUE -2.49801   0.830036 -3.00952 0.057234 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 0.575066   Adj. R2: 0.668248
#> 
#> 
#> 
#> $white
#> $white$prop_white_scale
#> $white$prop_white_scale$any_black_uof
#> OLS estimation, Dep. Var.: prop_white_scale
#> Observations: 6 
#> Standard-errors: IID 
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 0.608978   0.217505 2.79984 0.038001 *  
#> ... 1 variable was removed because of collinearity (any_black_uofTRUE)
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 0.486355            
#> 
#> $white$prop_white_scale$any_black_arrest
#> OLS estimation, Dep. Var.: prop_white_scale
#> Observations: 6 
#> Standard-errors: IID 
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 0.608978   0.217505 2.79984 0.038001 *  
#> ... 1 variable was removed because of collinearity (any_black_arrestTRUE)
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 0.486355            
#> 
#> 
#> $white$prop_hisp_scale
#> $white$prop_hisp_scale$any_black_uof
#> OLS estimation, Dep. Var.: prop_hisp_scale
#> Observations: 6 
#> Standard-errors: IID 
#>             Estimate Std. Error  t value Pr(>|t|) 
#> (Intercept) 0.016825   0.269335 0.062468  0.95261 
#> ... 1 variable was removed because of collinearity (any_black_uofTRUE)
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 0.602251            
#> 
#> $white$prop_hisp_scale$any_black_arrest
#> OLS estimation, Dep. Var.: prop_hisp_scale
#> Observations: 6 
#> Standard-errors: IID 
#>             Estimate Std. Error  t value Pr(>|t|) 
#> (Intercept) 0.016825   0.269335 0.062468  0.95261 
#> ... 1 variable was removed because of collinearity (any_black_arrestTRUE)
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> RMSE: 0.602251

^{Created on 2022-10-19 by the reprex package (v2.0.1)}

CodePudding user response：

Here is an option

library(dplyr)
library(purrr)
library(fixest)
library(tidyr)
fmla_dat <- crossing(outcome, primary_ind) %>% 
   transmute(fmla = map2(primary_ind, outcome, ~ reformulate(.x, response = .y)))
analysis %>%
    group_by(off_race) %>%
    summarise(out = map(fmla_dat$fmla, 
    ~ possibly(feols, otherwise = NA)(.x, data = cur_data())), .groups = 'drop')