Home > database >  Optimize steps extend sequence date, intersect and compute by group with data.table
Optimize steps extend sequence date, intersect and compute by group with data.table

Time:01-13

I have a huge dataset, so I use the data.table package due to its fast computation. The dataset has this structure:

library(data.table)
library(dplyr)

# Example data: two gr1 groups (A, B), each split into two gr2 groups with
# a pair of date ranges; the ranges are identical across gr1 groups.
starts <- c('2020-01-01', '2020-02-01', '2020-02-01', '2020-02-04')
ends   <- c('2020-01-05', '2020-02-05', '2020-02-02', '2020-02-07')
dt <- data.table(
  gr1 = rep(LETTERS[1:2], each = 4),
  gr2 = rep(letters[3:6], each = 2),
  date1 = as.Date(rep(starts, 2)),
  date2 = as.Date(rep(ends, 2)),
  value = 1:8
)
dt

   gr1 gr2      date1      date2 value
1:   A   c 2020-01-01 2020-01-05     1
2:   A   c 2020-02-01 2020-02-05     2
3:   A   d 2020-02-01 2020-02-02     3
4:   A   d 2020-02-04 2020-02-07     4
5:   B   e 2020-01-01 2020-01-05     5
6:   B   e 2020-02-01 2020-02-05     6
7:   B   f 2020-02-01 2020-02-02     7
8:   B   f 2020-02-04 2020-02-07     8

I want to sum the value column on those dates (resulting from expanding the date ranges into sequences) that are present in all gr2 groups belonging to the same gr1 (the calculation is independent between gr1 groups).

My workaround is:

  1. Create a dataset with date column extending date ranges (date1 and date2)
dt2 <- dt[, .(gr1, gr2, date = seq(date1, date2, by = 'day'), value), by = 1:nrow(dt)]
  1. Add an is_shared column indicating whether a date is present in all gr2 groups for each gr1, using the Reduce and intersect functions, as found here
# Character dates so %in% matching inside the loop compares plain strings
dt2[, date := as.character(date)]

# For each gr1: intersect the unique date sets of all its gr2 groups, then
# flag every row whose date belongs to that shared set.
dt3 <- dt2 %>%
  split(by = 'gr1') %>%
  lapply(function(g) {
    shared_dates <- Reduce(intersect, g[, .(list(unique(date))), gr2]$V1)
    g[, is_shared := date %in% shared_dates][]
  }) %>%
  rbindlist()
dt3

    gr1 gr2       date value is_shared
 1:   A   c 2020-01-01     1     FALSE
 2:   A   c 2020-01-02     1     FALSE
 3:   A   c 2020-01-03     1     FALSE
 4:   A   c 2020-01-04     1     FALSE
 5:   A   c 2020-01-05     1     FALSE
 6:   A   c 2020-02-01     2      TRUE
 7:   A   c 2020-02-02     2      TRUE
 8:   A   c 2020-02-03     2     FALSE
 9:   A   c 2020-02-04     2      TRUE
10:   A   c 2020-02-05     2      TRUE
11:   A   d 2020-02-01     3      TRUE
12:   A   d 2020-02-02     3      TRUE
13:   A   d 2020-02-04     4      TRUE
14:   A   d 2020-02-05     4      TRUE
15:   A   d 2020-02-06     4     FALSE
16:   A   d 2020-02-07     4     FALSE
17:   B   e 2020-01-01     5     FALSE
18:   B   e 2020-01-02     5     FALSE
19:   B   e 2020-01-03     5     FALSE
20:   B   e 2020-01-04     5     FALSE
21:   B   e 2020-01-05     5     FALSE
22:   B   e 2020-02-01     6      TRUE
23:   B   e 2020-02-02     6      TRUE
24:   B   e 2020-02-03     6     FALSE
25:   B   e 2020-02-04     6      TRUE
26:   B   e 2020-02-05     6      TRUE
27:   B   f 2020-02-01     7      TRUE
28:   B   f 2020-02-02     7      TRUE
29:   B   f 2020-02-04     8      TRUE
30:   B   f 2020-02-05     8      TRUE
31:   B   f 2020-02-06     8     FALSE
32:   B   f 2020-02-07     8     FALSE
  1. Filter shared dates and compute calculation by gr1
# Keep only the shared dates and sum value per (gr1, date) in one step
dt4 <- dt3[is_shared == TRUE, .(value = sum(value)), by = .(gr1, date)]
dt4

   gr1       date value
1:   A 2020-02-01     5
2:   A 2020-02-02     5
3:   A 2020-02-04     6
4:   A 2020-02-05     6
5:   B 2020-02-01    13
6:   B 2020-02-02    13
7:   B 2020-02-04    14
8:   B 2020-02-05    14

Problem:

  • Huge size of dt2
  • split and lapply step produces crash on my system (15Gb RAM and 4Gb of Swap)

Possible optimizations:

  • Avoid the dt2 and dt3 intermediate objects, which are huge due to expanding the date ranges into daily rows.
  • I tried to use .I when creating the date sequence by row, but I got the error 'from' must be of length 1. So I changed it to 1:nrow(dt), which creates an unnecessary column called nrow (removed in later calculations).
  • Do not convert date to character class on dt2 (needed when searching with %in% inside lapply).

CodePudding user response:

Try this,

fun <- function(d1, d2, v, g2) {
  # Expand each (d1, d2) range into daily rows, keeping value and gr2.
  expanded <- as.data.table(tidyr::unnest(
    cbind(data.table(v = v, d = Map(seq, d1, d2, by = "days")), g2 = g2),
    d))
  groups <- unique(g2)
  # Keep only dates whose rows cover every gr2 group, then sum per date.
  expanded[, .SD[all(groups %in% g2), ], by = d][, .(value = sum(v)), by = d]
}

dt[, fun(date1, date2, value, gr2), by = gr1]
#       gr1          d value
#    <char>     <Date> <int>
# 1:      A 2020-02-01     5
# 2:      A 2020-02-02     5
# 3:      A 2020-02-04     6
# 4:      A 2020-02-05     6
# 5:      B 2020-02-01    13
# 6:      B 2020-02-02    13
# 7:      B 2020-02-04    14
# 8:      B 2020-02-05    14

There is not (yet? see data.table#2146 and data.table#3672) a data.table-internal unnest function, and the discussion in those issues suggests that tidyr::unnest is efficient enough at this task to preclude implementing one internally.

CodePudding user response:

Two more possible solutions, with benchmarking against the OP's and r2evans' solutions:

library(data.table)
library(dplyr)

fun <- function(d1, d2, v, g2) {
  # r2evans' solution, reproduced here for benchmarking
  expanded <- as.data.table(tidyr::unnest(
    cbind(data.table(v = v, date = Map(seq, d1, d2, by = "days")), g2 = g2),
    date))
  groups <- unique(g2)
  # Keep only dates present in every gr2 group, then sum values per date
  expanded[, .SD[all(groups %in% g2), ], by = date][
    , .(value = sum(v)), by = date]
}

OP's solution

f1 <- function(dt) {
  # OP's approach: expand every row into daily dates, flag dates shared by
  # all gr2 groups within a gr1 via split/lapply, then aggregate.
  daily <- dt[, .(gr1, gr2, date = seq(date1, date2, by = 'day'), value),
              by = 1:nrow(dt)]

  flagged <- daily %>%
    split(by = 'gr1') %>%
    lapply(function(g) {
      shared <- Reduce(intersect, g[, .(list(unique(date))), gr2]$V1)
      g[, is_shared := date %in% shared][]
    }) %>%
    rbindlist()

  flagged[is_shared == TRUE][, .(value = sum(value)), by = .(gr1, date)]
}

Option 2:

f2 <- function(dt) {
  # Expand the date ranges fully vectorized with sequence() -- no per-row
  # seq() calls and no grouping needed for the expansion itself.
  # NOTE: the site stripped the '+' operator here; without '+ 1L' this line
  # is a syntax error and d would miss the range-end day.
  d <- as.integer(dt$date2 - dt$date1) + 1L # days covered by each row
  # NOTE(review): lengths(value) > 1 equals "present in all gr2" only when
  # each gr1 has exactly two gr2 groups and dates are unique within a gr2 --
  # confirm for more general data.
  data.table(
    gr1 = rep.int(dt$gr1, d),
    date = as.Date(sequence(d, dt$date1), origin = "1970-01-01"),
    value = rep.int(dt$value, d)
  )[
    , .(value = .(value)), .(gr1, date) # aggregate value into list-vectors by gr1 and date
  ][
    lengths(value) > 1 # keep dates contributed by more than one row
  ][
    , value := unlist(lapply(value, sum)) # collapse each vector to its sum
  ]
}

Option 3

f3 <- function(dt) {
  # Same vectorized expansion as f2, but sums directly during the grouped
  # aggregation and uses the row count .N to detect shared dates.
  # NOTE: the site stripped the '+' operator here; without '+ 1L' this line
  # is a syntax error and d would miss the range-end day.
  d <- as.integer(dt$date2 - dt$date1) + 1L # days covered by each row
  # NOTE(review): N > 1 equals "present in all gr2" only when each gr1 has
  # exactly two gr2 groups and dates are unique within a gr2 -- confirm.
  data.table(
    gr1 = rep.int(dt$gr1, d),
    date = as.Date(sequence(d, dt$date1), origin = "1970-01-01"),
    value = rep.int(dt$value, d)
  )[
    , .(value = sum(value), .N), .(gr1, date) # sum value and count rows by gr1 and date
  ][
    N > 1 # keep dates contributed by more than one row
  ][
    , N := NULL # drop the helper count column
  ]
}

Benchmarking:

# Time all four approaches; check = "identical" makes microbenchmark verify
# that every expression returns the same result before reporting timings.
microbenchmark::microbenchmark(f1 = f1(dt),
                               r2evans = dt[, fun(date1, date2, value, gr2), by = gr1],
                               f2 = f2(dt),
                               f3 = f3(dt),
                               check = "identical")
#> Unit: microseconds
#>     expr     min       lq      mean   median       uq     max neval
#>       f1  4781.0  5077.75  5647.333  5408.80  6163.50  8197.2   100
#>  r2evans 11009.1 11800.60 12759.094 12367.85 13037.30 20305.4   100
#>       f2   864.8   932.85  1005.163   968.20  1003.45  3874.0   100
#>       f3  1764.2  1866.10  2271.164  1979.55  2392.35  6685.0   100

Data:

# Same example data as in the question, re-created for the benchmark
dt <- data.table(
  gr1 = rep(LETTERS[1:2], each = 4),
  gr2 = rep(letters[3:6], each = 2),
  date1 = as.Date(c('2020-01-01', '2020-02-01', '2020-02-01', '2020-02-04', '2020-01-01', '2020-02-01', '2020-02-01', '2020-02-04')),
  date2 = as.Date(c('2020-01-05', '2020-02-05', '2020-02-02', '2020-02-07', '2020-01-05', '2020-02-05', '2020-02-02', '2020-02-07')),
  value = 1:8
)
  • Related