In the below sample data, there are five columns.
Column 1 is the name of the group
Column 2 is the date
Columns 3 and 4 are independent variables.
Column 5 is the dependent variable (to be predicted) - last value in this column is NA, which is to be predicted.
library(data.table)
test_dt <- structure(list(group = c("B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"), date = structure(c(16491L,
16575L, 16666L, 16757L, 16855L, 16939L, 17030L, 17121L, 17225L,
17303L, 17394L, 17485L, 17596L, 17674L, 17765L, 17855L, 17960L,
18038L, 18129L, 18220L, 18324L, 18402L, 18493L, 18584L, 18688L,
18766L, 18857L, 18947L, 19052L, 16485L, 16574L, 16665L, 16756L,
16849L, 16940L, 17031L, 17122L, 17218L, 17304L, 17395L, 17486L,
17582L, 17668L, 17759L, 17850L, 17946L, 18032L, 18123L, 18214L,
18310L, 18401L, 18492L, 18583L, 18676L, 18765L, 18856L, 18947L,
19040L), class = c("IDate", "Date")), P1 = c(1, -1, 1, -1, 1,
-1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1,
1, -1, 1, -1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, 1, 1, 1, 1,
-1, -1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1), P2 = c(99.0913221263063,
79.324894514768, 82.6705734616995, 0, 53.6739380022962, 2.50152532031725,
22.5638051044083, 54.4412607449857, 0, 8.8553750966744, 21.5617305663032,
23.7895995218171, 76.0915492957747, 69.6560196560197, 100, 5.37885874649207,
52.8617134731603, 58.4073942410238, 100, 100, 59.9279835390946,
79.7548605240913, 100, 100, 69.5046439628483, 100, 84.9653537563822,
100, 38.675045823514, 59.4279042615295, 18.0385288966725, 0,
11.3231963765772, 32.7659574468085, 74.7805734347572, 100, 54.1704035874439,
74.4394618834081, 100, 88.025078369906, 100, 57.488986784141,
20.494923857868, 59.8158379373849, 74.006908462867, 85.4922279792746,
78.4701114488349, 92.1467764060356, 98.783185840708, 92.4103035878565,
73.9038189533239, 97.1790808240888, 93.6557139904939, 56.8330955777461,
56.9279493269992, 91.7260490894695, 65.3846153846154, 29.8549107142857
), sales = c(-0.324044069993523, 1.54771041187003, -0.259676023247202,
1.10346804241903, 2.24850532882765, -1.38235294117647, -0.467223559394025,
0.131527028804412, -1.59945550450911, 1.43531976744187, -0.337417865388034,
1.73559822747416, -0.501462599247804, -0.323169874947316, 0.895973935303696,
0.333188468781698, 1.18421052631579, 2.37235688499227, 3.22330097087378,
0.964960847900032, 1.87074829931972, -0.794115188497857, -1.28388017118402,
2.70902007791429, 0.0691682517724335, 1.26934843157847, -0.876484368688912,
-4.72615338413603, NA, 0.933908045977017, -0.693444982336777,
-1.30972941853772, 1.68558077436582, 0.842170929507158, 0.953757225433516,
-2.00538358008074, -0.939442115912692, 0.363890832750169, 1.58627805003868,
-0.489335006273528, -2.15820116442482, -2.7520986080119, -1.00603621730382,
-0.800892133008924, -1.85854932690377, -2.27005870841488, -0.444181225940188,
0.266217055639362, -1.47534189805222, -1.63002591323246, 0.400160064025612,
-1.70737139039419, -0.187453973354756, 0.493970652331832, 0.00704671975195748,
-1.06171201061712, -0.859118530418379, NA)), row.names = c(NA,
-58L), class = c("data.table", "data.frame"))
> head(test_dt, 10)
group date P1 P2 sales
1: B 2015-02-25 1 99.091322 -0.3240441
2: B 2015-05-20 -1 79.324895 1.5477104
3: B 2015-08-19 1 82.670573 -0.2596760
4: B 2015-11-18 -1 0.000000 1.1034680
5: B 2016-02-24 1 53.673938 2.2485053
6: B 2016-05-18 -1 2.501525 -1.3823529
7: B 2016-08-17 -1 22.563805 -0.4672236
8: B 2016-11-16 1 54.441261 0.1315270
9: B 2017-02-28 -1 0.000000 -1.5994555
10: B 2017-05-17 -1 8.855375 1.4353198
I want to predict the sales for the next quarter using the sales of previous 5 quarters in column sales
(with auto-arima). Also, I want to use the columns P1
and P2
to improve the accuracy of sales
prediction (regression).
Can someone please show how this can be achived?
CodePudding user response:
You can use fable. Full explanation you can find in Forecasting: Principles and practice
Example based on your data, with the key set to the group so forecasts are done for each group.
# fpp3 installs fable and a bunch of other needed libraries
# run the code below to install fpp3
# install.packages("fpp3")
library(fpp3)
# create training data
train_dat <- test_dt %>%
mutate(yq = yearquarter(date)) %>%
select(-date) %>%
filter(!is.na(sales)) %>%
tsibble(index = yq, key = group)
# fit models
fit <- train_dat %>%
model(arima = ARIMA(sales),
regression = TSLM(sales ~ P1 P2))
# quick check on models. use tidy to see individual terms and estimates.
glance(fit)
# A tibble: 4 × 18
group .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots r_squared adj_r_squared statistic p_value df
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <dbl> <dbl> <dbl> <dbl> <int>
1 A arima 1.21 -41.9 89.8 90.8 93.8 <cpl> <cpl> NA NA NA NA NA
2 A regression 1.37 -42.5 13.6 15.4 19.0 <NULL> <NULL> 0.119 0.0483 1.68 0.206 3
3 B arima 2.25 -50.6 107. 108. 111. <cpl> <cpl> NA NA NA NA NA
4 B regression 2.76 -52.4 33.2 35.0 38.6 <NULL> <NULL> 0.0482 -0.0280 0.633 0.539 3
# … with 4 more variables: CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>
# data to forecast
newdat <- test_dt %>%
mutate(yq = yearquarter(date)) %>%
select(-date) %>%
filter(is.na(sales)) %>%
tsibble(index = yq, key = group)
#forecast regression model
fit %>%
select(regression) %>%
forecast(new_data = newdat)
# A fable: 2 x 7 [?]
# Key: group, .model [2]
group .model yq sales .mean P1 P2
<chr> <chr> <qtr> <dist> <dbl> <dbl> <dbl>
1 A regression 2022 Q1 N(0.4, 1.7) 0.395 1 29.9
2 B regression 2022 Q1 N(0.97, 3.2) 0.971 1 38.7