I am a dplyr user. I constantly use its joins, piping, group_by, and summarise functions, but one day data.table came along, took a large data set that I was working on, and reduced the time for my calculation by 87%. Now I want to switch from dplyr to data.table. However, in my code, I have the following table:
# dput() output of the example data: a 71-row data.table with one row per
# (day d, hour h), a running `id`, a per-hour `network_price`, and a
# `group_id` factor defining three consecutive blocks of rows.
structure(list(tariff_label = c("tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tariff", "tariff", "tariff", "tariff", "tariff",
"tariff", "tariff", "tariff", "tariff", "tariff", "tariff", "tariff",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tv_special", "tv_special", "tv_special",
"tv_special", "tv_special", "tariff", "tariff", "tariff", "tariff",
"tariff", "tariff", "tariff", "tariff"), d = c("7", "7", "7",
"7", "7", "7", "7", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3",
"3", "3", "3"), h = c("17", "18", "19", "20", "21", "22", "23",
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
"23", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
"22", "23", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15"), id = c(1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
71), period = c("day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day", "day", "day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day", "day", "day", "day", "day", "night", "night", "night",
"night", "night", "night", "night", "night", "night", "night",
"night", "night", "day", "day", "day", "day", "day", "day", "day",
"day"), week_period = c("weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "weekend",
"weekend", "weekend", "weekend", "weekend", "weekend", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day", "week day", "week day", "week day", "week day", "week day",
"week day"), center_id = c("12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc",
"12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc", "12_aDb4_09083641_aaHcc"
), network_price = c(1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 3.85, 3.85, 3.85, 3.85, 3.85, 3.85,
3.85, 3.85, 3.85, 3.85, 3.85, 3.85, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 3.85, 3.85, 3.85, 3.85, 3.85,
3.85, 3.85, 3.85), group_id = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L), .Label = c("1", "2", "3"), class = "factor")), row.names = c(NA,
-71L), class = c("data.table", "data.frame"))
I then did the following calculations:
# Coerce columns to the types the calculation expects.
df$network_price <- as.numeric(df$network_price)  # prices as numeric
df$group_id <- factor(df$group_id)                # grouping key as factor
df$id <- as.numeric(df$id)                        # row identifier as numeric
df <- data.table(df)
daily_cap <- 13.50                                # spending cap per group

# Main function in regards to my question.
# Each summarise() column may reference columns created just before it.
# NOTE(review): the `+` operators in `end_group` were stripped when this page
# was transcribed; restored here (confirmed by the printed output below:
# e.g. group 1: 1 + 9 - 1 + TRUE == 10).
msub_list <- df %>%
  group_by(group_id) %>%
  summarise(
    cum_sum = list(cumsum(network_price)),         # running total per group
    exceeded = list(unlist(cum_sum) > daily_cap),  # TRUE once the cap is passed
    exceeded_indicator = any(unlist(exceeded)),    # did this group ever exceed?
    last_false = sum(!unlist(exceeded)),           # number of rows under the cap
    start_group = range(id)[1],                    # first id in the group
    end_group = start_group + last_false - 1 + exceeded_indicator,
    subdf = list(df[start_group:end_group, ]),     # rows up to (incl.) the breach
    difference = daily_cap - unlist(cum_sum)[last_false]  # headroom left
  )
My goal is now to implement in data.table the same calculation I did for msub_list. Notice how the columns perform calculations using previously created columns; for example, the column exceeded uses the newly created column cum_sum, and so on. I tried to implement this with data.table but I got the error "object 'cum_sum' not found". Is there a way to implement this in data.table? Just for your info, I did the calculation this way to avoid using a loop — and by a loop I mean a for loop, or splitting the data table and using lapply(). This has greatly sped up my code and I am only looking for solutions with data.table. I tried searching online for this but I couldn't find any solutions. Here is what my attempt looked like:
# Attempt with data.table
# NOTE(review): the `+` operators in `end_group` were stripped in
# transcription; restored below. Even so, this attempt fails with
# "object 'cum_sum' not found", because entries of the `.()` list in `j`
# cannot reference columns created earlier in the same list — see the
# `{}` workaround in the answers.
msub_list <- x[, .(cum_sum = list(cumsum(tariff_price)),
                   exceeded = list(unlist(cum_sum) > daily_cap),
                   exceeded_indicator = any(unlist(exceeded)),
                   last_false = sum(!unlist(exceeded)),
                   start_group = range(id)[1],
                   end_group = start_group + last_false - 1 + exceeded_indicator,
                   subdf = list(x[start_group:end_group, ]),
                   difference = daily_cap - unlist(cum_sum)[last_false]),
               by = group_id]
CodePudding user response:
You can use {}
to create temporary variables inside j, and then choose which ones to keep.
The below gets you the same information as the dplyr pipeline:
# Inside a `{}` block in `j`, temporary variables may freely reference one
# another; the last expression (a named list) defines the output columns.
# NOTE(review): `+` operators in `end_group` restored (stripped in
# transcription); `== F` replaced by `!` (same result, avoids the
# reassignable `F` shorthand).
df[, {
  cum_sum <- list(list(cumsum(network_price)))         # running total (list col)
  exceeded <- list(list(unlist(cum_sum) > daily_cap))  # cap-breach flags
  exceeded_indicator <- any(unlist(exceeded))          # any breach at all?
  start_group <- range(id)[1]                          # first id in the group
  last_false <- sum(!unlist(exceeded))                 # rows under the cap
  end_group <- start_group + last_false - 1 + exceeded_indicator
  subdf <- list(list(df[start_group:end_group, ]))     # the capped sub-table
  difference <- daily_cap - unlist(cum_sum)[last_false]
  # Return only the columns we want to keep.
  list(cum_sum = cum_sum,
       exceeded = exceeded,
       exceeded_indicator = exceeded_indicator,
       start_group = start_group,
       last_false = last_false,
       end_group = end_group,
       subdf = subdf,
       difference = difference)
}, by = group_id]
Output:
group_id cum_sum exceeded exceeded_indicator start_group last_false end_group subdf difference
1: 1 <list[1]> <list[1]> TRUE 1 9 10 <list[1]> 0.90
2: 2 <list[1]> <list[1]> TRUE 25 9 34 <list[1]> 0.90
3: 3 <list[1]> <list[1]> TRUE 49 4 53 <list[1]> 0.55
CodePudding user response:
You could look at the table.express
package,
although it doesn't overcome all limitations so nicely.
In this case, it cannot easily use newly created columns while summarising,
but you could use select
to provide an expression like in langtang's answer:
library(table.express)

# Same result via table.express: select() takes a `{}` block, so intermediate
# values can reference one another before the final list() picks the output.
# NOTE(review): `+` operators in `end_group` restored (stripped in
# transcription).
ans <- df %>%
  as.data.table %>%
  mutate(
    network_price = as.numeric(network_price),
    group_id = factor(group_id),
    id = as.integer(id)
  ) %>%
  group_by(group_id) %>%
  select({
    cum_sum <- cumsum(network_price)     # running total per group
    exceeded <- cum_sum > daily_cap      # cap-breach flags
    exceeded_indicator <- any(exceeded)  # any breach in this group?
    start_group <- min(id)               # first id in the group
    last_false <- sum(!exceeded)         # rows under the cap
    end_group <- start_group + last_false - 1L + exceeded_indicator
    difference <- daily_cap - cum_sum[last_false]
    list(
      cum_sum = list(cum_sum),
      exceeded = list(exceeded),
      exceeded_indicator = exceeded_indicator,
      start_group = start_group,
      end_group = end_group,
      last_false = last_false,
      difference = difference,
      subdf = list(df[start_group:end_group, ])
    )
  })
Otherwise you could go the "long" verbose way, although you do end up with fewer nested lists that might not be necessary:
# The "long" verbose way: several keyed, grouped steps instead of one `{}`
# block; it ends up with fewer nested list columns.
# NOTE(review): `+` operators in `end_group` restored (stripped in
# transcription).
ans <- df %>%
  as.data.table %>%
  mutate(
    network_price = as.numeric(network_price),
    group_id = factor(group_id),
    id = as.integer(id)
  ) %>%
  key_by(group_id) %>%
  transmute(cum_sum = cumsum(network_price), id) %>%     # per-group running total
  mutate(exceeded = cum_sum > daily_cap) %>%             # cap-breach flags
  key_by(group_id) %>%
  summarise(
    exceeded_indicator = any(exceeded),                  # any breach in group?
    start_group = min(id),                               # first id in group
    last_false = sum(!exceeded),                         # rows under the cap
    difference = daily_cap - cum_sum[sum(!exceeded)]     # headroom left
  ) %>%
  mutate(end_group = start_group + last_false - 1L + exceeded_indicator) %>%
  key_by(group_id) %>%
  mutate(subdf = list(df[start_group:end_group, ]))      # the capped sub-table
I use key_by
here because I think that might help optimise the repeated grouped calls.