Arima and Regression to predict Sales in R-CodePudding

In the below sample data, there are five columns.

Column 1 is the name of the group
Column 2 is the date
Columns 3 and 4 are independent variables. 
Column 5 is the dependent variable (to be predicted) - last value in this column is NA, which is to be predicted.

library(data.table)
test_dt <- structure(list(group = c("B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"), date = structure(c(16491L, 
16575L, 16666L, 16757L, 16855L, 16939L, 17030L, 17121L, 17225L, 
17303L, 17394L, 17485L, 17596L, 17674L, 17765L, 17855L, 17960L, 
18038L, 18129L, 18220L, 18324L, 18402L, 18493L, 18584L, 18688L, 
18766L, 18857L, 18947L, 19052L, 16485L, 16574L, 16665L, 16756L, 
16849L, 16940L, 17031L, 17122L, 17218L, 17304L, 17395L, 17486L, 
17582L, 17668L, 17759L, 17850L, 17946L, 18032L, 18123L, 18214L, 
18310L, 18401L, 18492L, 18583L, 18676L, 18765L, 18856L, 18947L, 
19040L), class = c("IDate", "Date")), P1 = c(1, -1, 1, -1, 1, 
-1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 
1, -1, 1, -1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 
-1, -1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1), P2 = c(99.0913221263063, 
79.324894514768, 82.6705734616995, 0, 53.6739380022962, 2.50152532031725, 
22.5638051044083, 54.4412607449857, 0, 8.8553750966744, 21.5617305663032, 
23.7895995218171, 76.0915492957747, 69.6560196560197, 100, 5.37885874649207, 
52.8617134731603, 58.4073942410238, 100, 100, 59.9279835390946, 
79.7548605240913, 100, 100, 69.5046439628483, 100, 84.9653537563822, 
100, 38.675045823514, 59.4279042615295, 18.0385288966725, 0, 
11.3231963765772, 32.7659574468085, 74.7805734347572, 100, 54.1704035874439, 
74.4394618834081, 100, 88.025078369906, 100, 57.488986784141, 
20.494923857868, 59.8158379373849, 74.006908462867, 85.4922279792746, 
78.4701114488349, 92.1467764060356, 98.783185840708, 92.4103035878565, 
73.9038189533239, 97.1790808240888, 93.6557139904939, 56.8330955777461, 
56.9279493269992, 91.7260490894695, 65.3846153846154, 29.8549107142857
), sales = c(-0.324044069993523, 1.54771041187003, -0.259676023247202, 
1.10346804241903, 2.24850532882765, -1.38235294117647, -0.467223559394025, 
0.131527028804412, -1.59945550450911, 1.43531976744187, -0.337417865388034, 
1.73559822747416, -0.501462599247804, -0.323169874947316, 0.895973935303696, 
0.333188468781698, 1.18421052631579, 2.37235688499227, 3.22330097087378, 
0.964960847900032, 1.87074829931972, -0.794115188497857, -1.28388017118402, 
2.70902007791429, 0.0691682517724335, 1.26934843157847, -0.876484368688912, 
-4.72615338413603, NA, 0.933908045977017, -0.693444982336777, 
-1.30972941853772, 1.68558077436582, 0.842170929507158, 0.953757225433516, 
-2.00538358008074, -0.939442115912692, 0.363890832750169, 1.58627805003868, 
-0.489335006273528, -2.15820116442482, -2.7520986080119, -1.00603621730382, 
-0.800892133008924, -1.85854932690377, -2.27005870841488, -0.444181225940188, 
0.266217055639362, -1.47534189805222, -1.63002591323246, 0.400160064025612, 
-1.70737139039419, -0.187453973354756, 0.493970652331832, 0.00704671975195748, 
-1.06171201061712, -0.859118530418379, NA)), row.names = c(NA, 
-58L), class = c("data.table", "data.frame"))

> head(test_dt, 10)

    group       date P1        P2      sales
 1:     B 2015-02-25  1 99.091322 -0.3240441
 2:     B 2015-05-20 -1 79.324895  1.5477104
 3:     B 2015-08-19  1 82.670573 -0.2596760
 4:     B 2015-11-18 -1  0.000000  1.1034680
 5:     B 2016-02-24  1 53.673938  2.2485053
 6:     B 2016-05-18 -1  2.501525 -1.3823529
 7:     B 2016-08-17 -1 22.563805 -0.4672236
 8:     B 2016-11-16  1 54.441261  0.1315270
 9:     B 2017-02-28 -1  0.000000 -1.5994555
10:     B 2017-05-17 -1  8.855375  1.4353198

I want to predict the sales for the next quarter using the sales of previous 5 quarters in column sales (with auto-arima). Also, I want to use the columns P1 and P2 to improve the accuracy of sales prediction (regression).

Can someone please show how this can be achived?

CodePudding user response：

You can use fable. Full explanation you can find in Forecasting: Principles and practice

Example based on your data, with the key set to the group so forecasts are done for each group.

# fpp3 installs fable and a bunch of other needed libraries
# run the code below to install fpp3
# install.packages("fpp3")

library(fpp3)

# create training data
train_dat <- test_dt %>% 
  mutate(yq = yearquarter(date)) %>% 
  select(-date) %>% 
  filter(!is.na(sales)) %>% 
  tsibble(index = yq, key = group)

# fit models
fit <- train_dat %>% 
  model(arima = ARIMA(sales),
        regression = TSLM(sales ~ P1   P2))

# quick check on models. use tidy to see individual terms and estimates.
glance(fit)
# A tibble: 4 × 18
  group .model     sigma2 log_lik   AIC  AICc   BIC ar_roots ma_roots r_squared adj_r_squared statistic p_value    df
  <chr> <chr>       <dbl>   <dbl> <dbl> <dbl> <dbl> <list>   <list>       <dbl>         <dbl>     <dbl>   <dbl> <int>
1 A     arima        1.21   -41.9  89.8  90.8  93.8 <cpl>    <cpl>      NA            NA         NA      NA        NA
2 A     regression   1.37   -42.5  13.6  15.4  19.0 <NULL>   <NULL>      0.119         0.0483     1.68    0.206     3
3 B     arima        2.25   -50.6 107.  108.  111.  <cpl>    <cpl>      NA            NA         NA      NA        NA
4 B     regression   2.76   -52.4  33.2  35.0  38.6 <NULL>   <NULL>      0.0482       -0.0280     0.633   0.539     3
# … with 4 more variables: CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>

# data to forecast
newdat <- test_dt %>% 
  mutate(yq = yearquarter(date)) %>% 
  select(-date) %>% 
  filter(is.na(sales)) %>% 
  tsibble(index = yq, key = group)

#forecast regression model
fit %>% 
  select(regression) %>% 
  forecast(new_data = newdat)

# A fable: 2 x 7 [?]
# Key:     group, .model [2]
  group .model          yq        sales .mean    P1    P2
  <chr> <chr>        <qtr>       <dist> <dbl> <dbl> <dbl>
1 A     regression 2022 Q1  N(0.4, 1.7) 0.395     1  29.9
2 B     regression 2022 Q1 N(0.97, 3.2) 0.971     1  38.7