Home > OS >  Finding statistics after grouping in data.table
Finding statistics after grouping in data.table

Time:10-28

I had a small question in regards to data.table. Since i'm not so good at it i'm not quite sure how I can do this in data.table.

Basically I have 3 columns and want to group by the first two columns ( key and date ) and then for each key and each date, find the maximum and minimum that occurred in the third column ( fare)

I tried doing this but it gives me an error

flights[, c("max_day", "min_day") := unlist(lapply(gross_fare, findr)), by = c("key", "created_date")]

Error in `[.data.table`(flights, , `:=`(c("max_day", "min_day"), unlist(lapply(gross_fare,  : 
  Supplied 18 items to be assigned to group 1 of size 9 in column 'max_day'. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code.

findr is a function which just finds the max and min i.e.

findr <- function(x) {list(max = max(x), min = min(x)}

I've done what I want to do in dplyr and I'll attach the code for that, but since i have millions of rows, dplyr eats up my ram so data.table would help

test <- flights %>%
  select(key, created_date, gross_fare) %>%
  group_by(key, created_date) %>%
  summarise(
            max_day = max(gross_fare),
            min_day = min(gross_fare),
            diff = max_day - min_day) %>%
  arrange(created_date)

I've put the dput output if anyone wants to use that If anyone can help that'd be great, thank you :)

structure(list(key = c("LHE_KHI_LHE KHI_PA-405_15.0_1", "KHI_ISB_KHI ISB_PK-370_20.0_0", 
"LHE_KHI_LHE KHI_PK-307_20.0_0", "ISB_KHI_ISB KHI_PF-124_20.0_1", 
"LHE_KHI_LHE KHI_PK-307_20.0_0", "LHE_KHI_LHE KHI_PA-405_15.0_1", 
"KHI_LHE_KHI LHE_PK-304_20.0_0", "KHI_ISB_KHI ISB_PA-204_15.0_1", 
"ISB_KHI_ISB KHI_PA-207_15.0_1", "KHI_ISB_KHI ISB_PA-200_20.0_1", 
"KHI_LHE_KHI LHE_PK-304_40.0_0", "ISB_KHI_ISB KHI_PA-201_35.0_1", 
"ISB_KHI_ISB KHI_ER-501_20.0_1", "KHI_LHE_KHI LHE_PF-145_20.0_2", 
"KHI_ISB_KHI ISB_PA-204_20.0_1", "LHE_KHI_LHE KHI_PA-401_0.0_0", 
"ISB_KHI_ISB KHI_PK-309_40.0_0", "KHI_ISB_KHI ISB_PF-123_20.0_2", 
"ISB_KHI_ISB KHI_PA-205_15.0_1", "LHE_KHI_LHE KHI_PF-142_0.0_0", 
"ISB_KHI_ISB KHI_PA-223_15.0_1", "ISB_KHI_ISB KHI_PF-126_20.0_2", 
"ISB_KHI_ISB KHI_PK-309_20.0_0", "KHI_ISB_KHI ISB_PF-121_20.0_2", 
"ISB_KHI_ISB KHI_PK-373_20.0_0", "KHI_LHE_KHI LHE_PF-145_20.0_2", 
"KHI_LHE_KHI LHE_PA-402_15.0_1", "LHE_KHI_LHE KHI_PA-407_20.0_1", 
"KHI_ISB_KHI ISB_PK-308_40.0_0", "KHI_LHE_KHI LHE_PF-145_20.0_2", 
"LHE_KHI_LHE KHI_PF-144_0.0_0", "ISB_KHI_ISB KHI_PK-369_40.0_0", 
"ISB_KHI_ISB KHI_PF-124_20.0_2", "KHI_ISB_KHI ISB_PA-204_15.0_1", 
"KHI_ISB_KHI ISB_PA-200_15.0_1", "ISB_KHI_ISB KHI_PF-124_20.0_1", 
"KHI_ISB_KHI ISB_PK-300_20.0_0", "ISB_KHI_ISB KHI_PF-122_20.0_2", 
"KHI_ISB_KHI ISB_PK-368_20.0_0", "KHI_ISB_KHI ISB_PA-204_15.0_1", 
"ISB_KHI_ISB KHI_ER-503_20.0_1", "ISB_KHI_ISB KHI_PA-209_15.0_1", 
"KHI_ISB_KHI ISB_PK-308_40.0_0", "ISB_KHI_ISB KHI_PF-124_20.0_1", 
"ISB_KHI_ISB KHI_PK-301_40.0_0", "KHI_LHE_KHI LHE_PA-408_35.0_1", 
"LHE_KHI_LHE KHI_PF-144_20.0_2", "KHI_ISB_KHI ISB_PF-121_20.0_2", 
"KHI_ISB_KHI ISB_PA-204_35.0_1", "ISB_KHI_ISB KHI_PK-309_40.0_0", 
"ISB_KHI_ISB KHI_PA-223_20.0_1", "KHI_ISB_KHI ISB_PA-206_35.0_1", 
"LHE_KHI_LHE KHI_PF-142_32.0_1", "LHE_KHI_LHE KHI_PF-142_20.0_1", 
"KHI_ISB_KHI ISB_PF-123_20.0_2", "ISB_KHI_ISB KHI_PA-209_15.0_1", 
"KHI_ISB_KHI ISB_PA-204_35.0_1", "ISB_KHI_ISB KHI_PA-201_20.0_1", 
"KHI_ISB_KHI ISB_PK-368_20.0_0", "ISB_KHI_ISB KHI_PA-205_20.0_1", 
"KHI_ISB_KHI ISB_PF-121_20.0_1", "ISB_KHI_ISB KHI_PF-124_20.0_1", 
"ISB_KHI_ISB KHI_PA-205_15.0_1", "KHI_LHE_KHI LHE_PF-145_20.0_2", 
"KHI_LHE_KHI LHE_PA-406_35.0_1", "KHI_ISB_KHI ISB_PK-308_20.0_0", 
"LHE_KHI_LHE KHI_PA-401_20.0_1", "LHE_KHI_LHE KHI_PA-401_15.0_1", 
"KHI_ISB_KHI ISB_PA-204_35.0_1", "KHI_LHE_KHI LHE_PA-406_35.0_1", 
"KHI_ISB_KHI ISB_PA-206_35.0_1", "KHI_ISB_KHI ISB_PF-121_20.0_1", 
"ISB_KHI_ISB KHI_PA-205_20.0_1", "LHE_KHI_LHE KHI_PF-142_20.0_1", 
"LHE_KHI_LHE KHI_PF-146_20.0_2", "LHE_KHI_LHE KHI_PA-401_35.0_1", 
"ISB_KHI_ISB KHI_PA-209_15.0_1", "ISB_KHI_ISB KHI_PK-301_40.0_0", 
"ISB_KHI_ISB KHI_PA-205_35.0_1", "KHI_LHE_KHI LHE_PA-406_15.0_1", 
"KHI_ISB_KHI ISB_PF-123_20.0_1", "ISB_KHI_ISB KHI_PA-201_35.0_1", 
"KHI_ISB_KHI ISB_PK-300_40.0_0", "KHI_LHE_KHI LHE_PA-402_35.0_1", 
"ISB_KHI_ISB KHI_ER-505_20.0_1", "ISB_KHI_ISB KHI_PF-122_20.0_2", 
"ISB_KHI_ISB KHI_PA-207_15.0_1", "KHI_LHE_KHI LHE_PA-404_35.0_1", 
"KHI_ISB_KHI ISB_PF-123_20.0_1", "ISB_KHI_ISB KHI_ER-503_20.0_1", 
"ISB_GIL_ISB GIL_PK-605_20.0_0", "KHI_ISB_KHI ISB_PF-123_20.0_1", 
"KHI_ISB_KHI ISB_PA-200_15.0_1", "ISB_KHI_ISB KHI_PF-122_20.0_2", 
"KHI_LHE_KHI LHE_PA-404_35.0_1", "ISB_KHI_ISB KHI_PF-122_20.0_2", 
"PEW_KHI_PEW KHI_PF-152_20.0_1", "LHE_KHI_LHE KHI_PK-303_20.0_0", 
"KHI_ISB_KHI ISB_PA-222_35.0_1", "ISB_KHI_ISB KHI_PF-124_20.0_1"
), created_date = c("2021-04-20", "2021-05-27", "2021-02-13", 
"2021-08-14", "2021-08-11", "2021-08-21", "2021-01-26", "2021-08-21", 
"2021-05-24", "2021-09-15", "2021-06-05", "2021-07-19", "2021-09-29", 
"2021-07-02", "2021-08-10", "2021-01-04", "2021-07-15", "2021-07-14", 
"2021-08-13", "2021-01-11", "2021-09-13", "2021-09-20", "2021-05-27", 
"2021-02-20", "2021-08-15", "2021-07-27", "2021-08-26", "2021-09-15", 
"2021-08-02", "2021-06-25", "2021-05-15", "2021-08-26", "2021-07-30",
"2021-06-27", "2021-08-07", "2021-03-19", "2021-03-02", "2021-06-06", 
"2021-08-15", "2021-06-27", "2021-09-19", "2021-07-28", "2021-08-09", 
"2021-08-16", "2021-09-09", "2021-06-04", "2021-08-12", "2021-05-15", 
"2021-07-26", "2021-05-27", "2021-08-12", "2021-08-02", "2021-01-26", 
"2021-04-20", "2021-08-26", "2021-08-26", "2021-03-21", "2021-01-09", 
"2021-04-23", "2021-01-04", "2021-08-13", "2021-06-22", "2021-05-31", 
"2021-08-18", "2021-06-16", "2021-08-14", "2021-08-10", "2021-06-16", 
"2021-04-08", "2021-05-20", "2021-06-22", "2021-04-20", "2021-01-05", 
"2021-02-27", "2021-07-07", "2021-03-26", "2021-08-16", "2021-05-01", 
"2021-07-31", "2021-06-14", "2021-06-16", "2021-03-25", "2021-09-14", 
"2021-06-06", "2021-09-02", "2021-08-06", "2021-07-18", "2021-02-28", 
"2021-04-28", "2021-09-19", "2021-08-25", "2021-06-17", "2021-06-07", 
"2021-06-17", "2021-07-07", "2021-08-23", "2021-07-09", "2021-07-19", 
"2021-07-14", "2021-05-21"), gross_fare = c(7796, 7427, 11504, 
6870, 6580, 14945, 8697, 7524, 7124, 6785, 11858, 7524, 11500, 
9525, 6785, 8739, 8200, 13560, 9045, 7400, 7524, 12500, 7458, 
14000, 6570, 9525, 6220, 10545, 8310, 7900, 7820, 8410, 11285, 
19892, 6810, 9800, 11441, 11900, 6570, 13592, 11500, 8300, 20380, 
8525, 7340, 9707, 7870, 10655, 10545, 11798, 14645, 10545, 8650, 
8650, 7870, 12945, 10799, 10227, 6765, 10227, 20120, 11045, 9403, 
7870, 7124, 6570, 6810, 6531, 8605, 7124, 11072, 7390, 10227, 
13435, 10530, 12280, 18945, 11147, 10545, 6531, 6620, 10799, 
18480, 32702, 5606, 13560, 23895, 8027, 9655, 11500, 11990, 6620, 
9403, 7620, 14645, 19105, 9000, 6440, 12645, 8025)), row.names = c(NA, 
-100L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001eb382c1ef0>)
> 

CodePudding user response:

I guess this line of code should do the job:

library(data.table)

flights[,  .(min_day = min(gross_fare), max_day = max(gross_fare), diff = max(gross_fare) - min(gross_fare)), by = .(key, created_date)][]

CodePudding user response:

Since the function findr returns a list, there's no need to complicate things:

findr <- function(x) {list(max = max(x), min = min(x))}

flights[, c("max_day", "min_day") := findr(gross_fare), by = list(key, created_date)][]

To also return the difference between max and min, use

findr2 <- function(x) {
  list(max = max(x), min = min(x), diff = diff(range(x)))
}

flights[, c("max_day", "min_day", "diff_day") := findr2(gross_fare), by = list(key, created_date)][]
  • Related