Home > OS >  Fill the missing quarter values from annual values using R
Fill the missing quarter values from annual values using R

Time:06-20

In the data.table below, column value shows the sales growth at each quarterly and annual event of groups A and B for the years 2017 and 2018.

Sometimes, groups A or B don't hold the quarterly event and just hold the annual event. In such a scenario, I want to create a new row in test_dt and copy the annual growth value to the quarter's growth value.

test_dt = structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A", 
"B", "B", "B", "B", "B", "B", "B", "B", "B"), yq = structure(c(17167, 
17257, 17348, 17440, 17532, 17622, 17713, 17805, 17167, 17257, 
17257, 17348, 17440, 17532, 17622, 17713, 17805), fiscal_start = 1, class = c("yearquarter", 
"vctrs_vctr")), event_type = c("quarterly", "annual", "quarterly", 
"quarterly", "quarterly", "quarterly", "quarterly", "quarterly", 
"quarterly", "annual", "quarterly", "quarterly", "quarterly", 
"quarterly", "annual", "quarterly", "quarterly"), value = c(4.39958592132506, 
-10.9243697478992, 3.8793103448276, -6.7936507936508, -3.44322344322345, 
4.41176470588236, -4.99153976311336, -1.98321891685735, -18.2953077916487, 
0.918273645546375, -4.99770747363595, 3.1849912739965, -10.7555555555556, 
-12.2950819672131, -1.15257958287595, -17.8796046720575, -6.99300699300699
)), row.names = c(NA, -17L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x5642ee9663a0>)

> test_dt
    group      yq event_type       value
 1:     A 2017 Q1  quarterly   4.3995859
 2:     A 2017 Q2     annual -10.9243697
 3:     A 2017 Q3  quarterly   3.8793103
 4:     A 2017 Q4  quarterly  -6.7936508
 5:     A 2018 Q1  quarterly  -3.4432234
 6:     A 2018 Q2  quarterly   4.4117647
 7:     A 2018 Q3  quarterly  -4.9915398
 8:     A 2018 Q4  quarterly  -1.9832189
 9:     B 2017 Q1  quarterly -18.2953078
10:     B 2017 Q2     annual   0.9182736
11:     B 2017 Q2  quarterly  -4.9977075
12:     B 2017 Q3  quarterly   3.1849913
13:     B 2017 Q4  quarterly -10.7555556
14:     B 2018 Q1  quarterly -12.2950820
15:     B 2018 Q2     annual  -1.1525796
16:     B 2018 Q3  quarterly -17.8796047
17:     B 2018 Q4  quarterly  -6.9930070

Here is the expected output

> test_dt
    group      yq event_type       value
 1:     A 2017 Q1  quarterly   4.3995859
 2:     A 2017 Q2     annual -10.9243697
 2:     A 2017 Q2  quarterly -10.9243697 <---- New row added
 3:     A 2017 Q3  quarterly   3.8793103
 4:     A 2017 Q4  quarterly  -6.7936508
 5:     A 2018 Q1  quarterly  -3.4432234
 6:     A 2018 Q2  quarterly   4.4117647
 7:     A 2018 Q3  quarterly  -4.9915398
 8:     A 2018 Q4  quarterly  -1.9832189
 9:     B 2017 Q1  quarterly -18.2953078
10:     B 2017 Q2     annual   0.9182736
11:     B 2017 Q2  quarterly  -4.9977075
12:     B 2017 Q3  quarterly   3.1849913
13:     B 2017 Q4  quarterly -10.7555556
14:     B 2018 Q1  quarterly -12.2950820
15:     B 2018 Q2     annual  -1.1525796
15:     B 2018 Q2  quarterly  -1.1525796 <---- New row added
16:     B 2018 Q3  quarterly -17.8796047
17:     B 2018 Q4  quarterly  -6.9930070

Please suggest a solution only using data.table

CodePudding user response:

I solved it as follows. Please suggest if efficiency can be improved any further.

setkey(test_dt, group, yq, event_type)
dt_1 = test_dt[CJ(group, yq, event_type, unique=TRUE)]
dt_1[, value := ifelse(event_type == "quarterly" & is.na(value), value[event_type == "annual"], value), by = .(group, yq)]
na.omit(dt_1[])

> dt_1
    group      yq event_type       value
 1:     A 2017 Q1  quarterly   4.3995859
 2:     A 2017 Q2     annual -10.9243697
 3:     A 2017 Q2  quarterly -10.9243697
 4:     A 2017 Q3  quarterly   3.8793103
 5:     A 2017 Q4  quarterly  -6.7936508
 6:     A 2018 Q1  quarterly  -3.4432234
 7:     A 2018 Q2  quarterly   4.4117647
 8:     A 2018 Q3  quarterly  -4.9915398
 9:     A 2018 Q4  quarterly  -1.9832189
10:     B 2017 Q1  quarterly -18.2953078
11:     B 2017 Q2     annual   0.9182736
12:     B 2017 Q2  quarterly  -4.9977075
13:     B 2017 Q3  quarterly   3.1849913
14:     B 2017 Q4  quarterly -10.7555556
15:     B 2018 Q1  quarterly -12.2950820
16:     B 2018 Q2     annual  -1.1525796
17:     B 2018 Q2  quarterly  -1.1525796
18:     B 2018 Q3  quarterly -17.8796047
19:     B 2018 Q4  quarterly  -6.9930070

CodePudding user response:

Good to see a fellow efficiency fanatic. Here is my attempt

library(data.table)

# group
x <- c("A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B")

# dummy data
df <- structure(list(group = x
                     , yq = structure(c(17167,17257, 17348, 17440, 17532, 17622, 17713, 17805, 17167, 17257, 17257, 17348, 17440, 17532, 17622, 17713, 17805)
                                      , fiscal_start = 1
                                      , class = c("yearquarter","vctrs_vctr")
                                      )
                     , event_type = c("quarterly", "annual", "quarterly","quarterly", "quarterly", "quarterly", "quarterly", "quarterly", "quarterly", "annual", "quarterly", "quarterly", "quarterly", "quarterly", "annual", "quarterly", "quarterly")
                     , value = c(4.39958592132506,-10.9243697478992, 3.8793103448276, -6.7936507936508, -3.44322344322345, 4.41176470588236, -4.99153976311336, -1.98321891685735, -18.2953077916487, 0.918273645546375, -4.99770747363595, 3.1849912739965, -10.7555555555556, -12.2950819672131, -1.15257958287595, -17.8796046720575, -6.99300699300699))
                , row.names = c(NA, -17L)
                , class = c("data.table", "data.frame")
                )
y <- 1e4
z <- vector('list', y)
for (i in 1:y)
{
  z[[i]] <- df
}
df <- rbindlist(z)

# create distinct group across batches (of dfs)
df[, group := paste0(df$group, rep(1:y, each=length(x)))]

# benchmark
a <-
  microbenchmark(Saurabh = { setkey(df, group, yq, event_type)
                              dt_1 = df[CJ(group, yq, event_type, unique=TRUE)]
                              dt_1[, value := ifelse(event_type == "quarterly" & is.na(value), value[event_type == "annual"], value), by = .(group, yq)]
                              na.omit(dt_1[])
                              }
                 , akrun = {df[df[,  .(yq = if('annual' %in% event_type) yq else first(yq)   0:3), .(group, yr = year(yq), event_type)
                                  ][, yr := NULL
                                    ], on = .(group, yq, event_type)
                               ][order(group, yq, event_type == 'quarterly')
                                 ][, value := nafill(value, type = 'locf')
                                   ]
                              }
                 , sweepydodo = {df[, `:=` (yq = tsibble::yearquarter(yq)
                                             , year = year(yq)
                                             , quarter = quarter(yq)
                                             )
                                     ][, dummy := sum(chmatch(event_type, 'quarterly'), na.rm=T)
                                       , .(group, year)
                                       ]
                                  
                                  x <- df[event_type == 'annual'
                                          & dummy < 4
                                          ][, event_type := 'quarterly'
                                            ]
                                  
                                  df <- rbind(df, x)[, c('dummy', 'year', 'quarter') := NULL
                                                     ][order(group, yq, event_type)
                                                     ]
                                  }
                 , times = 20
                 )

# plot
autoplot(a)

benchmark

Let me know any comments you may have

  • Related