Randomization/ permutation test-CodePudding

I have a panel data that looks like this:

x <- structure(list(id2 = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 
4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11), private = c(1, 
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 
0, 0, 0, 0, 1, 1, 1), capex = c(-0.003423963, -0.028064674, -0.03058208, 
-0.00186256, -0.010839419, 0.052905358, 0.058931317, 0.065547734, 
0.007644231, -0.025942514, 0.00163772, -0.007530502, 0.010706151, 
0.025040116, 0.035105374, 0.036772128, 0.03886272, 0.045399148, 
0.042642809, 0.080788992, 0.080848917, 0.057645567, 0.057636742, 
0.046084184, 0.041080192, 0.05690382, 0.057325598, 0.051791377, 
0.070084445, 0.069627948, 0.077849329, 0.077247024, 0.081251919, 
0.071702167, 0.078424804, 0.077482991, 0.078969546, 0.076208547, 
0.059055354, 0.056043826, 0.029450929, 0.016044363, 0.048353843, 
0.047607671, 0.046497576, 0.047454875, 0.050881654, 0.047155183, 
0.055546004, 0.057564467), roa = c(-0.003078416, -0.035302367, 
-0.01884984, 0.002839225, -0.001113289, 0.024291474, 0.040153231, 
0.051482957, 0.026102768, 0.005372915, 0.004466314, -0.025178509, 
-0.002043936, -0.069235161, 0.023604594, 0.010512878, 0.021912357, 
0.016721437, -0.09472625, 0.04061316, 0.074661337, 0.0214584, 
0.047743626, 0.013149141, -0.010418181, 0.025671346, 0.031785361, 
0.084893651, 0.018490626, 0.024941774, 0.023567598, 0.031878859, 
0.029931206, 0.043837443, 0.041305128, 0.041293647, 0.039307728, 
0.046259467, 0.017479861, 0.029429797, 0.023826957, 0.00763736, 
0.017485917, 0.017156925, 0.006504506, 0.021350464, 0.032917287, 
0.036106978, 0.04545372, 0.049348988), year = c(2011, 2012, 2013, 
2014, 2015, 2011, 2012, 2013, 2011, 2012, 2013, 2014, 2015, 2011, 
2012, 2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015, 2011, 2012, 
2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015, 2011, 2012, 2013, 
2014, 2015, 2011, 2012, 2013, 2014, 2011, 2012, 2013, 2014, 2015, 
2011, 2012, 2013)), row.names = c(NA, -50L), class = c("tbl_df", 
"tbl", "data.frame"))

Where id2 is firm ID and private is an indicator for private/public status. My goal is to run a randomization test for r-squared as follows:

regress roa on capex for private firms (i.e. private==1) and public (private==0) separately and get the observed difference in R-squared
randomly assign firms to private-public status (note that the data is panel)
rerun the regression and get the difference in r-squared for the random sample
repeat this, say, 1000 times
measure the p-value as the number of times that the randomly generated difference in R2 is larger than the observed difference divided by the number of iterations (1,000)

My issue is that this code takes ages to run, it will be great if someone has an idea of a better way to do this.

you will need estimatr and tidyverse packages to run this code

library(estimatr)
library(tidyverse)

# run model 1
mod1 <-lm_robust(roa ~ capex,
                 cluster=id2,
                 se_type = "stata",
                 data=x,private==0)
# run model 2
mod2 <-lm_robust(roa ~ capex,
                 cluster=id2,
                 se_type = "stata",
                 data=x,private==1)

#obtain the observed difference in R2
R2.obs1 <- summary(mod1)$adj.r.squared
R2.obs2 <- summary(mod2)$adj.r.squared
diff_r2_obs <- R2.obs2 - R2.obs1
]

#create a list for the simulated differnce in r2
simulated_r2 <- list()

# prepare the loop 

set.seed(8)
nreps = 1000 

for(i in 1:nreps){
  x1 <- x %>%      # randamize the variable private taking into acount each id appears a number of times 
    distinct(id2, private) %>% 
    mutate(private1=sample(private), replace=T) %>% 
    left_join(x, by="id2")  
  
  model_m <-lm_robust(roa ~ capex,
                      cluster=id2,
                      se_type = "stata",
                      data = x1,
                      subset=private1==0)
  R2.obs_m <- summary(model_m)$adj.r.squared
  
  model_f <-lm_robust(roa ~ capex,
                      cluster=id2,
                      se_type = "stata",
                      data = x1,
                      subset=private1==1)
  
  R2.obs_f <- summary(model_f)$adj.r.squared
  r2_diff_sim <- R2.obs_f - R2.obs_m
  simulated_r2[i] <- r2_diff_sim
  
}

simulated_r2 <- unlist(simulated_r2)

exceed_count <- length(simulated_r2[simulated_r2 >=
                                          diff_r2_obs])
p_val <- exceed_count / nreps

p_val

CodePudding user response：

I initialized simulated_r2 as a vector rather than a list. We benefit a lot when we predefine it's length=. Also I replaced the dplyr code with an transform approach which appears to be faster. Runs quite fast with 1000 reps on this data actually. (Note, that in your code there was a mistake when using sample, the replace= argument is outside the call but should be inside!)

nreps <- 1000
simulated_r2 <- numeric(length=nreps)

set.seed(8)
system.time(
  for (i in seq_len(nreps)) {
    x1 <- transform(subset(x, !duplicated(id2)), 
                    private=sample(private, replace=T))
    model_m <- estimatr::lm_robust(roa ~ capex, cluster=id2, se_type="stata",
                                   data=x1, subset=private == 0)
    R2.obs_m <- summary(model_m)$adj.r.squared
    model_f <- estimatr::lm_robust(roa ~ capex, cluster=id2, se_type="stata", 
                                   data=x1, subset=private == 1)
    R2.obs_f <- summary(model_f)$adj.r.squared
    r2_diff_sim <- R2.obs_f - R2.obs_m
    simulated_r2[i] <- r2_diff_sim
  } 
)
#  user  system elapsed 
# 8.262   0.000   8.267 

(p_val <- length(simulated_r2[simulated_r2 >= diff_r2_obs])/nreps)
# [1] 0.273

Note, that warnings occur if you set replace=TRUE, probably when the nobs of one of the subsets on 0/1 get to small. You should rethink your approach a little.

Parallel

If this is too slow, you could parallelize the code.

library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, 'x')

nreps <- 10000

set.seed(8)
system.time(
  simulated_r2 <- parSapplyLB(cl, seq_len(nreps), \(i) {
    x1 <- transform(subset(x, !duplicated(id2)), 
                    private=sample(private, replace=F))
    model_m <- estimatr::lm_robust(roa ~ capex, cluster=id2, se_type="stata",
                                   data=x1, subset=private == 0)
    R2.obs_m <- summary(model_m)$adj.r.squared
    model_f <- estimatr::lm_robust(roa ~ capex, cluster=id2, se_type="stata", 
                                   data=x1, subset=private == 1)
    R2.obs_f <- summary(model_f)$adj.r.squared
    R2.obs_f - R2.obs_m
  })
)
#  user  system elapsed 
# 0.067   0.058  17.820 

(p_val <- length(simulated_r2[simulated_r2 >= diff_r2_obs])/nreps)
# [1] 0.244

CodePudding user response：

This takes a different approach to jay.sf's nice solution. Here, I use data.table, create the random permutations of private/public status, use a small function to get difference(s) in r-sq:

library(data.table)
setDT(x)

set.seed(8)

nperms=1000

# get permutations of public/private status
perms = cbind(unique(x[,.(id2)]), unique(x[,.(id2,private)])[,lapply(1:nperms, function(p) sample(private, replace=F))])

# function to get R-sq differences
obsR <- function(r,c,id,p) {
  r0 = lm_robust(r[p==0]~c[p==0],cluster=id[p==0],se_type="stata")$adj.r.squared
  r1 = lm_robust(r[p==1]~c[p==1],cluster=id[p==1],se_type="stata")$adj.r.squared
  r1-r0
}

# empirical r-sq diff
empirR = x[, obsR(roa,capex,id2,private)]

# simulated r-sq differences
simR = x[perms, on=.(id2)][, sapply(.SD, function(v) obsR(roa,capex,id2,v)), .SDcols = patterns("V")]

# pvalue
sum(simR>=empirR)/nperms

With the small provided dataset, this can do a thousand estimates and get the p-value in under 5 seconds on my machine.

   user  system elapsed 
   4.56    0.14    4.70