Home > Enterprise >  Arima and Regression to predict Sales in R
Arima and Regression to predict Sales in R

Time:05-19

In the below sample data, there are five columns.

Column 1 is the name of the group
Column 2 is the date
Columns 3 and 4 are independent variables. 
Column 5 is the dependent variable (to be predicted) - last value in this column is NA, which is to be predicted.
library(data.table)
test_dt <- structure(list(group = c("B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"), date = structure(c(16491L, 
16575L, 16666L, 16757L, 16855L, 16939L, 17030L, 17121L, 17225L, 
17303L, 17394L, 17485L, 17596L, 17674L, 17765L, 17855L, 17960L, 
18038L, 18129L, 18220L, 18324L, 18402L, 18493L, 18584L, 18688L, 
18766L, 18857L, 18947L, 19052L, 16485L, 16574L, 16665L, 16756L, 
16849L, 16940L, 17031L, 17122L, 17218L, 17304L, 17395L, 17486L, 
17582L, 17668L, 17759L, 17850L, 17946L, 18032L, 18123L, 18214L, 
18310L, 18401L, 18492L, 18583L, 18676L, 18765L, 18856L, 18947L, 
19040L), class = c("IDate", "Date")), P1 = c(1, -1, 1, -1, 1, 
-1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 
1, -1, 1, -1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 
-1, -1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1), P2 = c(99.0913221263063, 
79.324894514768, 82.6705734616995, 0, 53.6739380022962, 2.50152532031725, 
22.5638051044083, 54.4412607449857, 0, 8.8553750966744, 21.5617305663032, 
23.7895995218171, 76.0915492957747, 69.6560196560197, 100, 5.37885874649207, 
52.8617134731603, 58.4073942410238, 100, 100, 59.9279835390946, 
79.7548605240913, 100, 100, 69.5046439628483, 100, 84.9653537563822, 
100, 38.675045823514, 59.4279042615295, 18.0385288966725, 0, 
11.3231963765772, 32.7659574468085, 74.7805734347572, 100, 54.1704035874439, 
74.4394618834081, 100, 88.025078369906, 100, 57.488986784141, 
20.494923857868, 59.8158379373849, 74.006908462867, 85.4922279792746, 
78.4701114488349, 92.1467764060356, 98.783185840708, 92.4103035878565, 
73.9038189533239, 97.1790808240888, 93.6557139904939, 56.8330955777461, 
56.9279493269992, 91.7260490894695, 65.3846153846154, 29.8549107142857
), sales = c(-0.324044069993523, 1.54771041187003, -0.259676023247202, 
1.10346804241903, 2.24850532882765, -1.38235294117647, -0.467223559394025, 
0.131527028804412, -1.59945550450911, 1.43531976744187, -0.337417865388034, 
1.73559822747416, -0.501462599247804, -0.323169874947316, 0.895973935303696, 
0.333188468781698, 1.18421052631579, 2.37235688499227, 3.22330097087378, 
0.964960847900032, 1.87074829931972, -0.794115188497857, -1.28388017118402, 
2.70902007791429, 0.0691682517724335, 1.26934843157847, -0.876484368688912, 
-4.72615338413603, NA, 0.933908045977017, -0.693444982336777, 
-1.30972941853772, 1.68558077436582, 0.842170929507158, 0.953757225433516, 
-2.00538358008074, -0.939442115912692, 0.363890832750169, 1.58627805003868, 
-0.489335006273528, -2.15820116442482, -2.7520986080119, -1.00603621730382, 
-0.800892133008924, -1.85854932690377, -2.27005870841488, -0.444181225940188, 
0.266217055639362, -1.47534189805222, -1.63002591323246, 0.400160064025612, 
-1.70737139039419, -0.187453973354756, 0.493970652331832, 0.00704671975195748, 
-1.06171201061712, -0.859118530418379, NA)), row.names = c(NA, 
-58L), class = c("data.table", "data.frame"))

> head(test_dt, 10)

    group       date P1        P2      sales
 1:     B 2015-02-25  1 99.091322 -0.3240441
 2:     B 2015-05-20 -1 79.324895  1.5477104
 3:     B 2015-08-19  1 82.670573 -0.2596760
 4:     B 2015-11-18 -1  0.000000  1.1034680
 5:     B 2016-02-24  1 53.673938  2.2485053
 6:     B 2016-05-18 -1  2.501525 -1.3823529
 7:     B 2016-08-17 -1 22.563805 -0.4672236
 8:     B 2016-11-16  1 54.441261  0.1315270
 9:     B 2017-02-28 -1  0.000000 -1.5994555
10:     B 2017-05-17 -1  8.855375  1.4353198

I want to predict the sales for the next quarter using the sales of previous 5 quarters in column sales (with auto-arima). Also, I want to use the columns P1 and P2 to improve the accuracy of sales prediction (regression).

Can someone please show how this can be achived?

CodePudding user response:

You can use fable. Full explanation you can find in Forecasting: Principles and practice

Example based on your data, with the key set to the group so forecasts are done for each group.

# fpp3 installs fable and a bunch of other needed libraries
# run the code below to install fpp3
# install.packages("fpp3")

library(fpp3)

# create training data
train_dat <- test_dt %>% 
  mutate(yq = yearquarter(date)) %>% 
  select(-date) %>% 
  filter(!is.na(sales)) %>% 
  tsibble(index = yq, key = group)

# fit models
fit <- train_dat %>% 
  model(arima = ARIMA(sales),
        regression = TSLM(sales ~ P1   P2))

# quick check on models. use tidy to see individual terms and estimates.
glance(fit)
# A tibble: 4 × 18
  group .model     sigma2 log_lik   AIC  AICc   BIC ar_roots ma_roots r_squared adj_r_squared statistic p_value    df
  <chr> <chr>       <dbl>   <dbl> <dbl> <dbl> <dbl> <list>   <list>       <dbl>         <dbl>     <dbl>   <dbl> <int>
1 A     arima        1.21   -41.9  89.8  90.8  93.8 <cpl>    <cpl>      NA            NA         NA      NA        NA
2 A     regression   1.37   -42.5  13.6  15.4  19.0 <NULL>   <NULL>      0.119         0.0483     1.68    0.206     3
3 B     arima        2.25   -50.6 107.  108.  111.  <cpl>    <cpl>      NA            NA         NA      NA        NA
4 B     regression   2.76   -52.4  33.2  35.0  38.6 <NULL>   <NULL>      0.0482       -0.0280     0.633   0.539     3
# … with 4 more variables: CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>

# data to forecast
newdat <- test_dt %>% 
  mutate(yq = yearquarter(date)) %>% 
  select(-date) %>% 
  filter(is.na(sales)) %>% 
  tsibble(index = yq, key = group)

#forecast regression model
fit %>% 
  select(regression) %>% 
  forecast(new_data = newdat)

# A fable: 2 x 7 [?]
# Key:     group, .model [2]
  group .model          yq        sales .mean    P1    P2
  <chr> <chr>        <qtr>       <dist> <dbl> <dbl> <dbl>
1 A     regression 2022 Q1  N(0.4, 1.7) 0.395     1  29.9
2 B     regression 2022 Q1 N(0.97, 3.2) 0.971     1  38.7
  • Related