Home > Mobile >  How to use `last()` when mutating by group with {dbplyr}?
How to use `last()` when mutating by group with {dbplyr}?

Time:12-05

Consider the following remote table:

library(dbplyr)
library(dplyr, w = F)
remote_data <- memdb_frame(
  grp = c(2, 2, 2, 1, 3, 1, 1),
  win = c("B", "C", "A", "B", "C", "A", "C"),
  id = c(1,3,5,7,2,4,6),
)

I wish to group by grp, order by win and take the last id. This is fairly straightforward if I collect first

# intended output when collecting first
remote_data %>% 
  collect() %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup()
#> # A tibble: 7 × 4
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2

However I cannot directly convert this to {dbplyr} code by removing collect(), though the SQL code doesn't look bad, what's happening here ?

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       4
#> 2     1 B         7       7
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       1
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT
#>   *,
#>   LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

dbplyr::window_order() allows us to override th ORDER BY clause created by the group_by(), I tried window_order(,win), but no cookie:

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  window_order(win) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       4
#> 2     1 B         7       7
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       1
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

For some reason window_order(,grp) does trigger a window calculation but not with the expected order:

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  window_order(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       5
#> 6     2 C         3       5
#> 7     3 C         2       2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

What can I do to keep my initial output with only remote computations, preferably {dbplyr} code ?

CodePudding user response:

Here is a work around using a join, but that's not very satisfying, possibly inefficient too:

lkp <- remote_data %>% 
  group_by(grp) %>% 
  filter(win == max(win, na.rm = TRUE)) %>% 
  ungroup() %>% 
  select(grp,  last_id = id) %>% 
  distinct()

remote_data %>% 
  left_join(lkp, by = "grp") %>% 
  arrange(grp, win) %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT *
#> FROM (
#>   SELECT `LHS`.`grp` AS `grp`, `win`, `id`, `last_id`
#>   FROM `dbplyr_001` AS `LHS`
#>   LEFT JOIN (
#>     SELECT DISTINCT `grp`, `id` AS `last_id`
#>     FROM (
#>       SELECT `grp`, `win`, `id`
#>       FROM (
#>         SELECT *, MAX(`win`) OVER (PARTITION BY `grp`) AS `q01`
#>         FROM `dbplyr_001`
#>       )
#>       WHERE (`win` = `q01`)
#>     )
#>   ) AS `RHS`
#>     ON (`LHS`.`grp` = `RHS`.`grp`)
#> )
#> ORDER BY `grp`, `win`

Created on 2022-12-04 with reprex v2.0.2

CodePudding user response:

While last() seems to be broken first() appears to work as expected, so you could make use of the order_by argument to get the last value:

library(dbplyr)
library(dplyr, w = F)

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  mutate(last_id = first(id, order_by = desc(win))) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()

# Source:     SQL [7 x 4]
# Database:   sqlite 3.39.4 [:memory:]
# Ordered by: grp, win
    grp win      id last_id
  <dbl> <chr> <dbl>   <dbl>
1     1 A         4       6
2     1 B         7       6
3     1 C         6       6
4     2 A         5       3
5     2 B         1       3
6     2 C         3       3
7     3 C         2       2
<SQL>
SELECT
  *,
  FIRST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win` DESC) AS `last_id`
FROM `dbplyr_001`
ORDER BY `grp`, `win`
  • Related