I am a dplyr user. I constantly use its joins, piping, group_by, and summarise functions, but one day data.table came along, took a large data set that I was working on, and reduced the time for my calculation by 87%. Now I want to switch from dplyr to data.table. However, in my code, I have the following table:
# dput() output of the example data: a 71-row data.table with one row per
# (day d, hour h), a running `id`, a per-hour `network_price`, and a
# `group_id` factor defining three consecutive blocks of rows.
structure(list(tariff_label = c("tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tariff", "tariff", "tariff", "tariff", "tariff",
"tariff", "tariff", "tariff", "tariff", "tariff", "tariff", "tariff",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tariff", "tariff", "tariff", "tariff",
"tariff", "tariff", "tariff", "tariff"), d = c("7", "7", "7",
"7", "7", "7", "7", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3",
"3", "3", "3"), h = c("17", "18", "19", "20", "21", "22", "23",
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
"23", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
"22", "23", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15"), id = c(1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
71), period = c("day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day", "day", "day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day", "day", "day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day"), week_period = c("weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day"), center_id = c("12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc"
), network_price = c(1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 3.85, 3.85, 3.85, 3.85, 3.85, 3.85,
3.85, 3.85, 3.85, 3.85, 3.85, 3.85, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 3.85, 3.85, 3.85, 3.85, 3.85,
3.85, 3.85, 3.85), group_id = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L), .Label = c("1", "2", "3"), class = "factor")), row.names = c(NA,
-71L), class = c("data.table", "data.frame"))
I then did the following calculations:
# Coerce columns to the types the calculation expects.
df$network_price <- as.numeric(df$network_price)  # prices as numeric
df$group_id <- factor(df$group_id)                # grouping key as factor
df$id <- as.numeric(df$id)                        # row identifier as numeric
df <- data.table(df)
daily_cap <- 13.50                                # spending cap per group

# Main function in regards to my question.
# Each summarise() column may reference columns created just before it.
# NOTE(review): the `+` operators in `end_group` were stripped when this page
# was transcribed; restored here (confirmed by the printed output below:
# e.g. group 1: 1 + 9 - 1 + TRUE == 10).
msub_list <- df %>%
  group_by(group_id) %>%
  summarise(
    cum_sum = list(cumsum(network_price)),         # running total per group
    exceeded = list(unlist(cum_sum) > daily_cap),  # TRUE once the cap is passed
    exceeded_indicator = any(unlist(exceeded)),    # did this group ever exceed?
    last_false = sum(!unlist(exceeded)),           # number of rows under the cap
    start_group = range(id)[1],                    # first id in the group
    end_group = start_group + last_false - 1 + exceeded_indicator,
    subdf = list(df[start_group:end_group, ]),     # rows up to (incl.) the breach
    difference = daily_cap - unlist(cum_sum)[last_false]  # headroom left
  )
My goal is now to implement in data.table the same calculation I did for msub_list. Notice how the columns perform calculations using previously created columns; for example, the column exceeded uses the newly created column cum_sum, and so on. I tried to implement this with data.table but I got the error "object 'cum_sum' not found". Is there a way to implement this in data.table? Just for your info, I did the calculation this way to avoid using a loop — and by a loop I mean a for loop, or splitting the data table and using lapply(). This has greatly sped up my code and I am only looking for solutions with data.table. I tried searching online for this but I couldn't find any solutions. Here is what my attempt looked like:
# Attempt with data.table
# NOTE(review): the `+` operators in `end_group` were stripped in
# transcription; restored below. Even so, this attempt fails with
# "object 'cum_sum' not found", because entries of the `.()` list in `j`
# cannot reference columns created earlier in the same list — see the
# `{}` workaround in the answers.
msub_list <- x[, .(cum_sum = list(cumsum(tariff_price)),
                   exceeded = list(unlist(cum_sum) > daily_cap),
                   exceeded_indicator = any(unlist(exceeded)),
                   last_false = sum(!unlist(exceeded)),
                   start_group = range(id)[1],
                   end_group = start_group + last_false - 1 + exceeded_indicator,
                   subdf = list(x[start_group:end_group, ]),
                   difference = daily_cap - unlist(cum_sum)[last_false]),
               by = group_id]
CodePudding user response:
You can use {}
to create temporary variables inside j, and then choose which ones to keep.
The below gets you the same information as the dplyr pipeline:
# Inside a `{}` block in `j`, temporary variables may freely reference one
# another; the last expression (a named list) defines the output columns.
# NOTE(review): `+` operators in `end_group` restored (stripped in
# transcription); `== F` replaced by `!` (same result, avoids the
# reassignable `F` shorthand).
df[, {
  cum_sum <- list(list(cumsum(network_price)))         # running total (list col)
  exceeded <- list(list(unlist(cum_sum) > daily_cap))  # cap-breach flags
  exceeded_indicator <- any(unlist(exceeded))          # any breach at all?
  start_group <- range(id)[1]                          # first id in the group
  last_false <- sum(!unlist(exceeded))                 # rows under the cap
  end_group <- start_group + last_false - 1 + exceeded_indicator
  subdf <- list(list(df[start_group:end_group, ]))     # the capped sub-table
  difference <- daily_cap - unlist(cum_sum)[last_false]
  # Return only the columns we want to keep.
  list(cum_sum = cum_sum,
       exceeded = exceeded,
       exceeded_indicator = exceeded_indicator,
       start_group = start_group,
       last_false = last_false,
       end_group = end_group,
       subdf = subdf,
       difference = difference)
}, by = group_id]
Output:
group_id cum_sum exceeded exceeded_indicator start_group last_false end_group subdf difference
1: 1 <list[1]> <list[1]> TRUE 1 9 10 <list[1]> 0.90
2: 2 <list[1]> <list[1]> TRUE 25 9 34 <list[1]> 0.90
3: 3 <list[1]> <list[1]> TRUE 49 4 53 <list[1]> 0.55
CodePudding user response:
You could look at the table.express
package,
although it doesn't overcome all limitations so nicely.
In this case, it cannot easily use newly created columns while summarising,
but you could use select
to provide an expression like in langtang's answer:
library(table.express)

# Same result via table.express: select() takes a `{}` block, so intermediate
# values can reference one another before the final list() picks the output.
# NOTE(review): `+` operators in `end_group` restored (stripped in
# transcription).
ans <- df %>%
  as.data.table %>%
  mutate(
    network_price = as.numeric(network_price),
    group_id = factor(group_id),
    id = as.integer(id)
  ) %>%
  group_by(group_id) %>%
  select({
    cum_sum <- cumsum(network_price)     # running total per group
    exceeded <- cum_sum > daily_cap      # cap-breach flags
    exceeded_indicator <- any(exceeded)  # any breach in this group?
    start_group <- min(id)               # first id in the group
    last_false <- sum(!exceeded)         # rows under the cap
    end_group <- start_group + last_false - 1L + exceeded_indicator
    difference <- daily_cap - cum_sum[last_false]
    list(
      cum_sum = list(cum_sum),
      exceeded = list(exceeded),
      exceeded_indicator = exceeded_indicator,
      start_group = start_group,
      end_group = end_group,
      last_false = last_false,
      difference = difference,
      subdf = list(df[start_group:end_group, ])
    )
  })
Otherwise you could go the "long" verbose way, although you do end up with fewer nested lists that might not be necessary:
# The "long" verbose way: several keyed, grouped steps instead of one `{}`
# block; it ends up with fewer nested list columns.
# NOTE(review): `+` operators in `end_group` restored (stripped in
# transcription).
ans <- df %>%
  as.data.table %>%
  mutate(
    network_price = as.numeric(network_price),
    group_id = factor(group_id),
    id = as.integer(id)
  ) %>%
  key_by(group_id) %>%
  transmute(cum_sum = cumsum(network_price), id) %>%     # per-group running total
  mutate(exceeded = cum_sum > daily_cap) %>%             # cap-breach flags
  key_by(group_id) %>%
  summarise(
    exceeded_indicator = any(exceeded),                  # any breach in group?
    start_group = min(id),                               # first id in group
    last_false = sum(!exceeded),                         # rows under the cap
    difference = daily_cap - cum_sum[sum(!exceeded)]     # headroom left
  ) %>%
  mutate(end_group = start_group + last_false - 1L + exceeded_indicator) %>%
  key_by(group_id) %>%
  mutate(subdf = list(df[start_group:end_group, ]))      # the capped sub-table
I use key_by
here because I think that might help optimise the repeated grouped calls.