I have a panel data that looks like this:
x <- structure(list(id2 = c(1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8,
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11), private = c(1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 1, 1, 1), capex = c(-0.003423963, -0.028064674, -0.03058208,
-0.00186256, -0.010839419, 0.052905358, 0.058931317, 0.065547734,
0.007644231, -0.025942514, 0.00163772, -0.007530502, 0.010706151,
0.025040116, 0.035105374, 0.036772128, 0.03886272, 0.045399148,
0.042642809, 0.080788992, 0.080848917, 0.057645567, 0.057636742,
0.046084184, 0.041080192, 0.05690382, 0.057325598, 0.051791377,
0.070084445, 0.069627948, 0.077849329, 0.077247024, 0.081251919,
0.071702167, 0.078424804, 0.077482991, 0.078969546, 0.076208547,
0.059055354, 0.056043826, 0.029450929, 0.016044363, 0.048353843,
0.047607671, 0.046497576, 0.047454875, 0.050881654, 0.047155183,
0.055546004, 0.057564467), roa = c(-0.003078416, -0.035302367,
-0.01884984, 0.002839225, -0.001113289, 0.024291474, 0.040153231,
0.051482957, 0.026102768, 0.005372915, 0.004466314, -0.025178509,
-0.002043936, -0.069235161, 0.023604594, 0.010512878, 0.021912357,
0.016721437, -0.09472625, 0.04061316, 0.074661337, 0.0214584,
0.047743626, 0.013149141, -0.010418181, 0.025671346, 0.031785361,
0.084893651, 0.018490626, 0.024941774, 0.023567598, 0.031878859,
0.029931206, 0.043837443, 0.041305128, 0.041293647, 0.039307728,
0.046259467, 0.017479861, 0.029429797, 0.023826957, 0.00763736,
0.017485917, 0.017156925, 0.006504506, 0.021350464, 0.032917287,
0.036106978, 0.04545372, 0.049348988), year = c(2011, 2012, 2013,
2014, 2015, 2011, 2012, 2013, 2011, 2012, 2013, 2014, 2015, 2011,
2012, 2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015, 2011, 2012,
2013, 2014, 2015, 2011, 2012, 2013, 2014, 2015, 2011, 2012, 2013,
2014, 2015, 2011, 2012, 2013, 2014, 2011, 2012, 2013, 2014, 2015,
2011, 2012, 2013)), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
Where id2
is firm ID and private
is an indicator for private/public status. My goal is to run a randomization test for r-squared as follows:
- regress
roa
oncapex
for private firms (i.e. private==1) and public (private==0) separately and get the observed difference in R-squared - randomly assign firms to private-public status (note that the data is panel)
- rerun the regression and get the difference in r-squared for the random sample
- repeat this, say, 1000 times
- measure the p-value as the number of times that the randomly generated difference in R2 is larger than the observed difference divided by the number of iterations (1,000)
My issue is that this code takes ages to run, it will be great if someone has an idea of a better way to do this.
you will need estimatr
and tidyverse
packages to run this code
library(estimatr)
library(tidyverse)
# run model 1
mod1 <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data=x,private==0)
# run model 2
mod2 <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data=x,private==1)
#obtain the observed difference in R2
R2.obs1 <- summary(mod1)$adj.r.squared
R2.obs2 <- summary(mod2)$adj.r.squared
diff_r2_obs <- R2.obs2 - R2.obs1
]
#create a list for the simulated differnce in r2
simulated_r2 <- list()
# prepare the loop
set.seed(8)
nreps = 1000
for(i in 1:nreps){
x1 <- x %>% # randamize the variable private taking into acount each id appears a number of times
distinct(id2, private) %>%
mutate(private1=sample(private), replace=T) %>%
left_join(x, by="id2")
model_m <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data = x1,
subset=private1==0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <-lm_robust(roa ~ capex,
cluster=id2,
se_type = "stata",
data = x1,
subset=private1==1)
R2.obs_f <- summary(model_f)$adj.r.squared
r2_diff_sim <- R2.obs_f - R2.obs_m
simulated_r2[i] <- r2_diff_sim
}
simulated_r2 <- unlist(simulated_r2)
exceed_count <- length(simulated_r2[simulated_r2 >=
diff_r2_obs])
p_val <- exceed_count / nreps
p_val
CodePudding user response:
I initialized simulated_r2
as a vector rather than a list. We benefit a lot when we predefine it's length=
. Also I replaced the dplyr
code with an transform
approach which appears to be faster. Runs quite fast with 1000 reps on this data actually. (Note, that in your code there was a mistake when using sample
, the replace=
argument is outside the call but should be inside!)
nreps <- 1000
simulated_r2 <- numeric(length=nreps)
set.seed(8)
system.time(
for (i in seq_len(nreps)) {
x1 <- transform(subset(x, !duplicated(id2)),
private=sample(private, replace=T))
model_m <- estimatr::lm_robust(roa ~ capex, cluster=id2, se_type="stata",
data=x1, subset=private == 0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <- estimatr::lm_robust(roa ~ capex, cluster=id2, se_type="stata",
data=x1, subset=private == 1)
R2.obs_f <- summary(model_f)$adj.r.squared
r2_diff_sim <- R2.obs_f - R2.obs_m
simulated_r2[i] <- r2_diff_sim
}
)
# user system elapsed
# 8.262 0.000 8.267
(p_val <- length(simulated_r2[simulated_r2 >= diff_r2_obs])/nreps)
# [1] 0.273
Note, that warnings occur if you set replace=TRUE
, probably when the nobs of one of the subsets on 0/1 get to small. You should rethink your approach a little.
Parallel
If this is too slow, you could parallelize the code.
library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, 'x')
nreps <- 10000
set.seed(8)
system.time(
simulated_r2 <- parSapplyLB(cl, seq_len(nreps), \(i) {
x1 <- transform(subset(x, !duplicated(id2)),
private=sample(private, replace=F))
model_m <- estimatr::lm_robust(roa ~ capex, cluster=id2, se_type="stata",
data=x1, subset=private == 0)
R2.obs_m <- summary(model_m)$adj.r.squared
model_f <- estimatr::lm_robust(roa ~ capex, cluster=id2, se_type="stata",
data=x1, subset=private == 1)
R2.obs_f <- summary(model_f)$adj.r.squared
R2.obs_f - R2.obs_m
})
)
# user system elapsed
# 0.067 0.058 17.820
(p_val <- length(simulated_r2[simulated_r2 >= diff_r2_obs])/nreps)
# [1] 0.244
CodePudding user response:
This takes a different approach to jay.sf's nice solution. Here, I use data.table, create the random permutations of private/public status, use a small function to get difference(s) in r-sq:
library(data.table)
setDT(x)
set.seed(8)
nperms=1000
# get permutations of public/private status
perms = cbind(unique(x[,.(id2)]), unique(x[,.(id2,private)])[,lapply(1:nperms, function(p) sample(private, replace=F))])
# function to get R-sq differences
obsR <- function(r,c,id,p) {
r0 = lm_robust(r[p==0]~c[p==0],cluster=id[p==0],se_type="stata")$adj.r.squared
r1 = lm_robust(r[p==1]~c[p==1],cluster=id[p==1],se_type="stata")$adj.r.squared
r1-r0
}
# empirical r-sq diff
empirR = x[, obsR(roa,capex,id2,private)]
# simulated r-sq differences
simR = x[perms, on=.(id2)][, sapply(.SD, function(v) obsR(roa,capex,id2,v)), .SDcols = patterns("V")]
# pvalue
sum(simR>=empirR)/nperms
With the small provided dataset, this can do a thousand estimates and get the p-value in under 5 seconds on my machine.
user system elapsed
4.56 0.14 4.70