Home > Software design >  Plotting 99% confidence interval and prediction interval struggle in R
Plotting 99% confidence interval and prediction interval struggle in R

Time:11-09

I am using the auto data from R I need to plot the confidence intervals but it is a struggle, this is what I got so far:

I have created the model for linear regression

my_acc<-auto_df$acceleration
my_horse<-auto_df$horsepower
mydata <- data.frame(my_acc, my_horse )

car_linear_regression <- lm(my_acc ~ my_horse, mydata )

I have created the confidence intervals for ONE prediction as the exercise is asking

conf_int<-predict(car_linear_regression,newdata = data.frame(my_horse = 93.5),interval = 'confidence' )
#data.frame(my_horse = 93.5) must be the same as in the original dataframe

pred_int<-predict(car_linear_regression,newdata = data.frame(my_horse = 93.5),interval = 'prediction' )

Then I am trying to plot everthing together but I am totally stuck, I can plot the data with the regression line, but I only get this error

Error in xy.coords(x, y) : 'x' and 'y' lengths differ

plot(my_acc ~ my_horse   , data = mydata, pch = 20, cex  = 1.5, col="blue", xlab=" car horsepower", ylab = "acceleration secs to 100km/h", main = "Confidence intervals and prediction intervals")
abline(car_linear_regression, lwd = 5,  col="red" )

lines(mydata$my_horse, conf_int[,"lwr"], col="red", type="b", pch=" ")


CodePudding user response:

For the plot you need definitely predictions on the whole range, i.e. min max of horespower.

data('Auto', package='ISLR')  

fo <- acceleration ~ horsepower  ## formula object for re-use

fit <- lm(fo, Auto)

We will need a sequence over the range of predictor horsepower, so a glance into summary is helpful.

summary(Auto)

Then we create a sequence for plotting with a reasonable step size. This will be what lines uses to plot the lines.

n_data <- with(Auto, seq(min(horsepower), max(horsepower), by=1))

Now calculate predictions using the sequences,

conf_int <- predict(fit, newdata=list(horsepower=n_data), 
                    interval='confidence', level=.99)
pred_int <- predict(fit, newdata=list(horsepower=n_data), 
                    interval='prediction', level=.99)

and plot the guy.

plot(fo, data=Auto, pch=20, cex=1, col="blue", 
     xlab=" car horsepower", ylab="acceleration secs to 100km/h", 
     main="Confidence intervals and prediction intervals", xlim=hp_rg)
abline(fit, lwd=2, col="red")
matlines(n_data, conf_int[, 2:3], lty='dashed', col="red", lwd=2)
matlines(n_data, pred_int[, 2:3], lty='dashed', col="green", lwd=2)
legend('topright', legend=c('conf_int', 'pred_int'), col=c("red", "green"),
       lty=2, lwd=2)

enter image description here

Note that I've used matlines here which is more concise, you could also use lines(n_data, conf_int[, 2], ..), lines(n_data, conf_int[, 3], ..) if you want.

CodePudding user response:

You can use ggplot2 and attach different data to different plotting layers or just base R:

library(tidyverse)

auto_df <- read_delim(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
  delim = " ",
  col_names = FALSE
)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   X1 = col_double(),
#>   X2 = col_character(),
#>   X3 = col_character(),
#>   X4 = col_character(),
#>   X5 = col_character(),
#>   X6 = col_character(),
#>   X7 = col_character(),
#>   X8 = col_character(),
#>   X9 = col_character(),
#>   X10 = col_character()
#> )
#> Warning: 246 parsing failures.
#> row col   expected     actual                                                                               file
#>   3  -- 10 columns 9 columns  'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
#>   5  -- 10 columns 9 columns  'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
#>   7  -- 10 columns 9 columns  'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
#>   9  -- 10 columns 9 columns  'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
#>  14  -- 10 columns 11 columns 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
#> ... ... .......... .......... ..................................................................................
#> See problems(...) for more details.
auto_df
#> # A tibble: 398 x 10
#>       X1 X2    X3      X4       X5       X6       X7    X8        X9      X10   
#>    <dbl> <chr> <chr>   <chr>    <chr>    <chr>    <chr> <chr>     <chr>   <chr> 
#>  1    18 "  8" "  307… "     1… "     3… "     1… "  7… " 1\t\"c… "cheve… "mali…
#>  2    15 "  8" "  350… "     1… "     3… "     1… "  7… " 1\t\"b… "skyla… "320\…
#>  3    18 "  8" "  318… "     1… "     3… "     1… "  7… " 1\t\"p… "satel…  <NA> 
#>  4    16 "  8" "  304… "     1… "     3… "     1… "  7… " 1\t\"a… "rebel" "sst\…
#>  5    17 "  8" "  302… "     1… "     3… "     1… "  7… " 1\t\"f… "torin…  <NA> 
#>  6    15 "  8" "  429… "     1… "     4… "     1… "  7… " 1\t\"f… "galax… "500\…
#>  7    14 "  8" "  454… "     2… "     4… "      … "  7… " 1\t\"c… "impal…  <NA> 
#>  8    14 "  8" "  440… "     2… "     4… "      … "  7… " 1\t\"p… "fury"  "iii\…
#>  9    14 "  8" "  455… "     2… "     4… "     1… "  7… " 1\t\"p… "catal…  <NA> 
#> 10    15 "  8" "  390… "     1… "     3… "      … "  7… " 1\t\"a… "ambas… "dpl\…
#> # … with 388 more rows

mydata <- tibble(my_acc = auto_df$X6, my_horse = auto_df$X4) %>% mutate_all(as.numeric)
#> Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
mydata
#> # A tibble: 398 x 2
#>    my_acc my_horse
#>     <dbl>    <dbl>
#>  1   12        130
#>  2   11.5      165
#>  3   11        150
#>  4   12        150
#>  5   10.5      140
#>  6   10        198
#>  7    9        220
#>  8    8.5      215
#>  9   10        225
#> 10    8.5      190
#> # … with 388 more rows

car_linear_regression <- lm(my_acc ~ my_horse, mydata)

conf_int <- predict(car_linear_regression, newdata = data.frame(my_horse = 93.5), interval = "confidence")
conf_int
#>       fit     lwr      upr
#> 1 16.0832 15.8765 16.28989
pred_int <- predict(car_linear_regression, newdata = data.frame(my_horse = 93.5), interval = "prediction")
pred_int
#>       fit      lwr      upr
#> 1 16.0832 12.14256 20.02383

# ggplot way
ggplot()  
  geom_rect(
    data = pred_int %>% as.data.frame(),
    mapping = aes(xmin = lwr, xmax = upr, ymin = -Inf, ymax = Inf),
    fill = "BurlyWood"
  )  
  geom_point(
    data = mydata,
    mapping = aes(my_acc, my_horse)
  )  
  stat_smooth(
    data = mydata,
    mapping = aes(my_acc, my_horse),
    method = "lm"
  )
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 6 rows containing non-finite values (stat_smooth).
#> Warning: Removed 6 rows containing missing values (geom_point).

# Base R way
plot(my_acc ~ my_horse   , data = mydata, pch = 20, cex  = 1.5, col="blue", xlab=" car horsepower", ylab = "acceleration secs to 100km/h", main = "Confidence intervals and prediction intervals")
rect(min(mydata$my_horse, na.rm = TRUE), pred_int[2], max(mydata$my_horse, na.rm = TRUE), pred_int[3])
abline(car_linear_regression, lwd = 5,  col="red" )

Created on 2021-11-08 by the reprex package (v2.0.1)

  • Related