In the data.table
below, column value
shows the sales growth at each quarterly and annual event of groups A and B for the years 2017 and 2018.
Sometimes, groups A or B don't hold the quarterly event and just hold the annual event. In such a scenario, I want to create a new row in test_dt
and copy the annual growth value to the quarter's growth value.
test_dt = structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B", "B"), yq = structure(c(17167,
17257, 17348, 17440, 17532, 17622, 17713, 17805, 17167, 17257,
17257, 17348, 17440, 17532, 17622, 17713, 17805), fiscal_start = 1, class = c("yearquarter",
"vctrs_vctr")), event_type = c("quarterly", "annual", "quarterly",
"quarterly", "quarterly", "quarterly", "quarterly", "quarterly",
"quarterly", "annual", "quarterly", "quarterly", "quarterly",
"quarterly", "annual", "quarterly", "quarterly"), value = c(4.39958592132506,
-10.9243697478992, 3.8793103448276, -6.7936507936508, -3.44322344322345,
4.41176470588236, -4.99153976311336, -1.98321891685735, -18.2953077916487,
0.918273645546375, -4.99770747363595, 3.1849912739965, -10.7555555555556,
-12.2950819672131, -1.15257958287595, -17.8796046720575, -6.99300699300699
)), row.names = c(NA, -17L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x5642ee9663a0>)
> test_dt
group yq event_type value
1: A 2017 Q1 quarterly 4.3995859
2: A 2017 Q2 annual -10.9243697
3: A 2017 Q3 quarterly 3.8793103
4: A 2017 Q4 quarterly -6.7936508
5: A 2018 Q1 quarterly -3.4432234
6: A 2018 Q2 quarterly 4.4117647
7: A 2018 Q3 quarterly -4.9915398
8: A 2018 Q4 quarterly -1.9832189
9: B 2017 Q1 quarterly -18.2953078
10: B 2017 Q2 annual 0.9182736
11: B 2017 Q2 quarterly -4.9977075
12: B 2017 Q3 quarterly 3.1849913
13: B 2017 Q4 quarterly -10.7555556
14: B 2018 Q1 quarterly -12.2950820
15: B 2018 Q2 annual -1.1525796
16: B 2018 Q3 quarterly -17.8796047
17: B 2018 Q4 quarterly -6.9930070
Here is the expected output
> test_dt
group yq event_type value
1: A 2017 Q1 quarterly 4.3995859
2: A 2017 Q2 annual -10.9243697
2: A 2017 Q2 quarterly -10.9243697 <---- New row added
3: A 2017 Q3 quarterly 3.8793103
4: A 2017 Q4 quarterly -6.7936508
5: A 2018 Q1 quarterly -3.4432234
6: A 2018 Q2 quarterly 4.4117647
7: A 2018 Q3 quarterly -4.9915398
8: A 2018 Q4 quarterly -1.9832189
9: B 2017 Q1 quarterly -18.2953078
10: B 2017 Q2 annual 0.9182736
11: B 2017 Q2 quarterly -4.9977075
12: B 2017 Q3 quarterly 3.1849913
13: B 2017 Q4 quarterly -10.7555556
14: B 2018 Q1 quarterly -12.2950820
15: B 2018 Q2 annual -1.1525796
15: B 2018 Q2 quarterly -1.1525796 <---- New row added
16: B 2018 Q3 quarterly -17.8796047
17: B 2018 Q4 quarterly -6.9930070
Please suggest a solution only using data.table
CodePudding user response:
I solved it as follows. Please suggest if efficiency can be improved any further.
setkey(test_dt, group, yq, event_type)
dt_1 = test_dt[CJ(group, yq, event_type, unique=TRUE)]
dt_1[, value := ifelse(event_type == "quarterly" & is.na(value), value[event_type == "annual"], value), by = .(group, yq)]
na.omit(dt_1[])
> dt_1
group yq event_type value
1: A 2017 Q1 quarterly 4.3995859
2: A 2017 Q2 annual -10.9243697
3: A 2017 Q2 quarterly -10.9243697
4: A 2017 Q3 quarterly 3.8793103
5: A 2017 Q4 quarterly -6.7936508
6: A 2018 Q1 quarterly -3.4432234
7: A 2018 Q2 quarterly 4.4117647
8: A 2018 Q3 quarterly -4.9915398
9: A 2018 Q4 quarterly -1.9832189
10: B 2017 Q1 quarterly -18.2953078
11: B 2017 Q2 annual 0.9182736
12: B 2017 Q2 quarterly -4.9977075
13: B 2017 Q3 quarterly 3.1849913
14: B 2017 Q4 quarterly -10.7555556
15: B 2018 Q1 quarterly -12.2950820
16: B 2018 Q2 annual -1.1525796
17: B 2018 Q2 quarterly -1.1525796
18: B 2018 Q3 quarterly -17.8796047
19: B 2018 Q4 quarterly -6.9930070
CodePudding user response:
Good to see a fellow efficiency fanatic. Here is my attempt
library(data.table)
# group
x <- c("A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B")
# dummy data
df <- structure(list(group = x
, yq = structure(c(17167,17257, 17348, 17440, 17532, 17622, 17713, 17805, 17167, 17257, 17257, 17348, 17440, 17532, 17622, 17713, 17805)
, fiscal_start = 1
, class = c("yearquarter","vctrs_vctr")
)
, event_type = c("quarterly", "annual", "quarterly","quarterly", "quarterly", "quarterly", "quarterly", "quarterly", "quarterly", "annual", "quarterly", "quarterly", "quarterly", "quarterly", "annual", "quarterly", "quarterly")
, value = c(4.39958592132506,-10.9243697478992, 3.8793103448276, -6.7936507936508, -3.44322344322345, 4.41176470588236, -4.99153976311336, -1.98321891685735, -18.2953077916487, 0.918273645546375, -4.99770747363595, 3.1849912739965, -10.7555555555556, -12.2950819672131, -1.15257958287595, -17.8796046720575, -6.99300699300699))
, row.names = c(NA, -17L)
, class = c("data.table", "data.frame")
)
y <- 1e4
z <- vector('list', y)
for (i in 1:y)
{
z[[i]] <- df
}
df <- rbindlist(z)
# create distinct group across batches (of dfs)
df[, group := paste0(df$group, rep(1:y, each=length(x)))]
# benchmark
a <-
microbenchmark(Saurabh = { setkey(df, group, yq, event_type)
dt_1 = df[CJ(group, yq, event_type, unique=TRUE)]
dt_1[, value := ifelse(event_type == "quarterly" & is.na(value), value[event_type == "annual"], value), by = .(group, yq)]
na.omit(dt_1[])
}
, akrun = {df[df[, .(yq = if('annual' %in% event_type) yq else first(yq) 0:3), .(group, yr = year(yq), event_type)
][, yr := NULL
], on = .(group, yq, event_type)
][order(group, yq, event_type == 'quarterly')
][, value := nafill(value, type = 'locf')
]
}
, sweepydodo = {df[, `:=` (yq = tsibble::yearquarter(yq)
, year = year(yq)
, quarter = quarter(yq)
)
][, dummy := sum(chmatch(event_type, 'quarterly'), na.rm=T)
, .(group, year)
]
x <- df[event_type == 'annual'
& dummy < 4
][, event_type := 'quarterly'
]
df <- rbind(df, x)[, c('dummy', 'year', 'quarter') := NULL
][order(group, yq, event_type)
]
}
, times = 20
)
# plot
autoplot(a)
Let me know any comments you may have