Bootstrapping CI for a quantile regression in R outside the quantreg framework-CodePudding

I have an object, model1, resulting from a quantile regression. In model1 I have 3 columns and 99 rows with a step of 1 centile like this:

> model1
    tau intercept  julian_day
1  0.01  17.25584 -0.02565709
2  0.02  19.11576 -0.04716498
3  0.03  20.92364 -0.07115870
4  0.04  22.74398 -0.09706853
5  0.05  24.54071 -0.12248334
6  0.06  26.19349 -0.14133451
7  0.07  27.83602 -0.15839657
8  0.08  29.64554 -0.18364227

I would like to calculate and plot a confidence interval for the values of the intercept and the variable, using a bootstrapping method.

I did not perform the quantile regression using the quantreg package, but instead I used this method:

#Functions

    minimize.logcosh <- function(par, X, y, tau) {
      diff <- y-(X %*% par)
      check <- (tau-0.5)*diff (0.5/0.7)*logcosh(0.7*diff) 0.4
      return(sum(check))
    }
    
    smrq <- function(X, y, tau){
      p <- ncol(X)
      op.result <- optim(
        rep(0, p),
        fn = minimize.logcosh,
        method = 'BFGS',
        X = X,
        y = y,
        tau = tau
      )
      beta <- op.result$par
      return(beta)
    }
    
    run_smrq <- function(data, fml, response) {
      x <- model.matrix(fml, data) #modify
      y <- data[[response]]
      #X <- cbind(x, rep(1,nrow(x)))
      X <- x
      
      n <- 99
      betas <- sapply(1:n, function(i) smrq(X, y, tau=i/(n 1)))
        return(betas) 
    }


#Callers

smrq_models <- df %>%
  group_by(lat_grouped) %>%
  group_map(~ run_smrq(data=., fml=julian_day~year_centered, response="julian_day"))

Here is what df looks like:

> dput(head(df, 20))
structure(list(lat = c("59", "59", "55", "59", "59", "63", "59", 
"59", "59", "59", "63", "59", "59", "59", "57", "56", "56", "59", 
"63", "63"), long = c(18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 
18, 18, 18, 18, 18, 18, 18, 18, 18, 18), date = c("1951-03-22", 
"1951-04-08", "1952-02-03", "1952-03-08", "1953-02-22", "1953-03-12", 
"1954-01-16", "1954-02-06", "1954-03-14", "1954-03-28", "1954-04-02", 
"1955-01-23", "1955-03-06", "1955-03-13", "1955-04-08", "1955-04-11", 
"1955-04-12", "1956-03-25", "1956-04-01", "1956-04-02"), julian_day = c(81, 
98, 34, 68, 53, 71, 16, 37, 73, 87, 92, 23, 65, 72, 98, 101, 
102, 85, 92, 93), year = c(1951L, 1951L, 1952L, 1952L, 1953L, 
1953L, 1954L, 1954L, 1954L, 1954L, 1954L, 1955L, 1955L, 1955L, 
1955L, 1955L, 1955L, 1956L, 1956L, 1956L), decade = c("1950-1959", 
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959", 
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959", 
"1950-1959", "1950-1959", "1950-1959", "1950-1959", "1950-1959", 
"1950-1959", "1950-1959", "1950-1959", "1950-1959"), time = c(10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L), lat_grouped = c("1", "1", "1", 
"1", "1", "2", "1", "1", "1", "1", "2", "1", "1", "1", "1", "1", 
"1", "1", "2", "2"), year_centered = structure(c(-36, -36, -35, 
-35, -34, -34, -33, -33, -33, -33, -33, -32, -32, -32, -32, -32, 
-32, -31, -31, -31), class = "AsIs")), row.names = 24:43, class = "data.frame")

Do you have any tips on how to achieve my goal? I have tried many boot functions found on the internet but have not been able to make something work so far.

Thank you very much for the help, do not hesitate if I missed something, I can update my post!

CodePudding user response：

First of all, there is a function missing in the question's code.

logcosh <- function(x) log(cosh(x))

As for the problem, write a function to sample the data.frame's rows with replacement, and run the quantile regressions just like in the question. There is no need to save them in an object smrq_models because the results are to be immediately returned to caller.

Note that I have removed the constant n = 99 from function run_smrq and it is now an argument.

Then, call the boot function in a loop.

Finally, to have the bootstrapped estimates, the lists returned by the quantile regression code must be first made an array in order for apply to compute the mean values. Assign the row names and we're done.

library(tidyverse)

boot_fun <- function(data, n) {
  i <- sample(nrow(data), nrow(data), replace = TRUE)
  df <- data[i, ]
  df %>%
    group_by(lat_grouped) %>%
    group_map(~ run_smrq(data=., fml=julian_day~year_centered, response="julian_day", n=n))
}

set.seed(2022)

n <- 99L
R <- 10L
boot_smrq_models <- vector("list", length = R)
for(i in seq.int(R)) {
  boot_smrq_models[[i]] <- boot_fun(df, n)
}

l <- length(boot_smrq_models[[1]])
smrq_models_all <- vector("list", length = l)
smrq_models_int <- vector("list", length = l)
for(i in seq.int(l)) {
  tmp <- array(dim = c(R, dim(boot_smrq_models[[1]][[i]])))
  for(j in seq.int(R)) {
    tmp[j, , ] <- boot_smrq_models[[j]][[i]]
  }
  smrq_models_all[[i]] <- t(apply(tmp, 2:3, mean))
  smrq_models_int[[i]] <- apply(tmp, 2:3, quantile, probs = c(0.025, 0.975))
  rownames(smrq_models_all[[i]]) <- sprintf("tau_.02f", (1:99)/(99 1))
}

lapply(smrq_models_all, head)
#> [[1]]
#>                [,1]       [,2]
#> tau_0.01 -23.173185 -1.4624225
#> tau_0.02   3.509690 -0.6640458
#> tau_0.03 -13.748280 -1.2172158
#> tau_0.04 -38.373045 -1.9780045
#> tau_0.05  -8.166044 -1.0890235
#> tau_0.06  51.701066  0.6258957
#> 
#> [[2]]
#>               [,1]      [,2]
#> tau_0.01  1.387311 -2.315114
#> tau_0.02 36.060786 -1.311070
#> tau_0.03  9.177731 -2.111396
#> tau_0.04 22.320567 -1.731685
#> tau_0.05 40.143578 -1.212108
#> tau_0.06 19.900275 -1.812859

^{Created on 2022-05-22 by the reprex package (v2.0.1)}