I have an object, model1
, resulting from a quantile regression. In model1
I have 3 columns and 99 rows with a step of 1 centile like this:
> model1
tau intercept julian_day
1 0.01 17.25584 -0.02565709
2 0.02 19.11576 -0.04716498
3 0.03 20.92364 -0.07115870
4 0.04 22.74398 -0.09706853
5 0.05 24.54071 -0.12248334
6 0.06 26.19349 -0.14133451
7 0.07 27.83602 -0.15839657
8 0.08 29.64554 -0.18364227
I would like to calculate and plot a confidence interval for the values of the intercept and the variable, using a bootstrapping method.
I did not perform the quantile regression using the quantreg
package, but instead I used this method:
#Functions
minimize.logcosh <- function(par, X, y, tau) {
diff <- y-(X %*% par)
check <- (tau-0.5)*diff (0.5/0.7)*logcosh(0.7*diff) 0.4
return(sum(check))
}
smrq <- function(X, y, tau){
p <- ncol(X)
op.result <- optim(
rep(0, p),
fn = minimize.logcosh,
method = 'BFGS',
X = X,
y = y,
tau = tau
)
beta <- op.result$par
return(beta)
}
run_smrq <- function(data, fml, response) {
x <- model.matrix(fml, data) #modify
y <- data[[response]]
#X <- cbind(x, rep(1,nrow(x)))
X <- x
n <- 99
betas <- sapply(1:n, function(i) smrq(X, y, tau=i/(n 1)))
return(betas)
}
#Callers
smrq_models <- df %>%
group_by(lat_grouped) %>%
group_map(~ run_smrq(data=., fml=julian_day~year_centered, response="julian_day"))
Here is what df
looks like:
> dput(head(df, 20))
structure(list(lat = c("59", "59", "55", "59", "59", "63", "59",
"59", "59", "59", "63", "59", "59", "59", "57", "56", "56", "59",
"63", "63"), long = c(18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
18, 18, 18, 18, 18, 18, 18, 18, 18, 18), date = c("1951-03-22",
"1951-04-08", "1952-02-03", "1952-03-08", "1953-02-22", "1953-03-12",
"1954-01-16", "1954-02-06", "1954-03-14", "1954-03-28", "1954-04-02",
"1955-01-23", "1955-03-06", "1955-03-13", "1955-04-08", "1955-04-11",
"1955-04-12", "1956-03-25", "1956-04-01", "1956-04-02"), julian_day = c(81,
98, 34, 68, 53, 71, 16, 37, 73, 87, 92, 23, 65, 72, 98, 101,
102, 85, 92, 93), year = c(1951L, 1951L, 1952L, 1952L, 1953L,
1953L, 1954L, 1954L, 1954L, 1954L, 1954L, 1955L, 1955L, 1955L,
1955L, 1955L, 1955L, 1956L, 1956L, 1956L), decade = c("1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959",
"1950-1959", "1950-1959", "1950-1959", "1950-1959"), time = c(10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L), lat_grouped = c("1", "1", "1",
"1", "1", "2", "1", "1", "1", "1", "2", "1", "1", "1", "1", "1",
"1", "1", "2", "2"), year_centered = structure(c(-36, -36, -35,
-35, -34, -34, -33, -33, -33, -33, -33, -32, -32, -32, -32, -32,
-32, -31, -31, -31), class = "AsIs")), row.names = 24:43, class = "data.frame")
Do you have any tips on how to achieve my goal? I have tried many boot functions found on the internet but have not been able to make something work so far.
Thank you very much for the help, do not hesitate if I missed something, I can update my post!
CodePudding user response:
First of all, there is a function missing in the question's code.
logcosh <- function(x) log(cosh(x))
As for the problem, write a function to sample the data.frame's rows with replacement, and run the quantile regressions just like in the question. There is no need to save them in an object smrq_models
because the results are to be immediately returned to caller.
Note that I have removed the constant n = 99
from function run_smrq
and it is now an argument.
Then, call the boot function in a loop.
Finally, to have the bootstrapped estimates, the lists returned by the quantile regression code must be first made an array in order for apply
to compute the mean values. Assign the row names and we're done.
library(tidyverse)
boot_fun <- function(data, n) {
i <- sample(nrow(data), nrow(data), replace = TRUE)
df <- data[i, ]
df %>%
group_by(lat_grouped) %>%
group_map(~ run_smrq(data=., fml=julian_day~year_centered, response="julian_day", n=n))
}
set.seed(2022)
n <- 99L
R <- 10L
boot_smrq_models <- vector("list", length = R)
for(i in seq.int(R)) {
boot_smrq_models[[i]] <- boot_fun(df, n)
}
l <- length(boot_smrq_models[[1]])
smrq_models_all <- vector("list", length = l)
smrq_models_int <- vector("list", length = l)
for(i in seq.int(l)) {
tmp <- array(dim = c(R, dim(boot_smrq_models[[1]][[i]])))
for(j in seq.int(R)) {
tmp[j, , ] <- boot_smrq_models[[j]][[i]]
}
smrq_models_all[[i]] <- t(apply(tmp, 2:3, mean))
smrq_models_int[[i]] <- apply(tmp, 2:3, quantile, probs = c(0.025, 0.975))
rownames(smrq_models_all[[i]]) <- sprintf("tau_.02f", (1:99)/(99 1))
}
lapply(smrq_models_all, head)
#> [[1]]
#> [,1] [,2]
#> tau_0.01 -23.173185 -1.4624225
#> tau_0.02 3.509690 -0.6640458
#> tau_0.03 -13.748280 -1.2172158
#> tau_0.04 -38.373045 -1.9780045
#> tau_0.05 -8.166044 -1.0890235
#> tau_0.06 51.701066 0.6258957
#>
#> [[2]]
#> [,1] [,2]
#> tau_0.01 1.387311 -2.315114
#> tau_0.02 36.060786 -1.311070
#> tau_0.03 9.177731 -2.111396
#> tau_0.04 22.320567 -1.731685
#> tau_0.05 40.143578 -1.212108
#> tau_0.06 19.900275 -1.812859
Created on 2022-05-22 by the reprex package (v2.0.1)