Consider the following remote table:
library(dbplyr)
library(dplyr, w = F)
remote_data <- memdb_frame(
grp = c(2, 2, 2, 1, 3, 1, 1),
win = c("B", "C", "A", "B", "C", "A", "C"),
id = c(1,3,5,7,2,4,6),
)
I wish to group by grp
, order by win
and take the last id.
This is fairly straightforward if I collect first
# intended output when collecting first
remote_data %>%
collect() %>%
arrange(grp, win) %>%
group_by(grp) %>%
mutate(last_id = last(id)) %>%
ungroup()
#> # A tibble: 7 × 4
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 6
#> 2 1 B 7 6
#> 3 1 C 6 6
#> 4 2 A 5 3
#> 5 2 B 1 3
#> 6 2 C 3 3
#> 7 3 C 2 2
However I cannot directly convert this to {dbplyr} code by removing collect()
, though the SQL code doesn't look bad, what's happening here ?
remote_data %>%
arrange(grp, win) %>%
group_by(grp) %>%
mutate(last_id = last(id)) %>%
ungroup() %>%
print() %>%
show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 4
#> 2 1 B 7 7
#> 3 1 C 6 6
#> 4 2 A 5 5
#> 5 2 B 1 1
#> 6 2 C 3 3
#> 7 3 C 2 2
#> <SQL>
#> SELECT
#> *,
#> LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`
dbplyr::window_order()
allows us to override th ORDER BY clause created by the group_by(), I tried window_order(,win)
, but no cookie:
remote_data %>%
arrange(grp, win) %>%
group_by(grp) %>%
window_order(win) %>%
mutate(last_id = last(id)) %>%
ungroup() %>%
print() %>%
show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 4
#> 2 1 B 7 7
#> 3 1 C 6 6
#> 4 2 A 5 5
#> 5 2 B 1 1
#> 6 2 C 3 3
#> 7 3 C 2 2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`
For some reason window_order(,grp)
does trigger a window calculation but not with the expected order:
remote_data %>%
arrange(grp, win) %>%
group_by(grp) %>%
window_order(grp) %>%
mutate(last_id = last(id)) %>%
ungroup() %>%
print() %>%
show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: grp
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 6
#> 2 1 B 7 6
#> 3 1 C 6 6
#> 4 2 A 5 5
#> 5 2 B 1 5
#> 6 2 C 3 5
#> 7 3 C 2 2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`
What can I do to keep my initial output with only remote computations, preferably {dbplyr} code ?
CodePudding user response:
Here is a work around using a join, but that's not very satisfying, possibly inefficient too:
lkp <- remote_data %>%
group_by(grp) %>%
filter(win == max(win, na.rm = TRUE)) %>%
ungroup() %>%
select(grp, last_id = id) %>%
distinct()
remote_data %>%
left_join(lkp, by = "grp") %>%
arrange(grp, win) %>%
print() %>%
show_query()
#> # Source: SQL [7 x 4]
#> # Database: sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#> grp win id last_id
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 4 6
#> 2 1 B 7 6
#> 3 1 C 6 6
#> 4 2 A 5 3
#> 5 2 B 1 3
#> 6 2 C 3 3
#> 7 3 C 2 2
#> <SQL>
#> SELECT *
#> FROM (
#> SELECT `LHS`.`grp` AS `grp`, `win`, `id`, `last_id`
#> FROM `dbplyr_001` AS `LHS`
#> LEFT JOIN (
#> SELECT DISTINCT `grp`, `id` AS `last_id`
#> FROM (
#> SELECT `grp`, `win`, `id`
#> FROM (
#> SELECT *, MAX(`win`) OVER (PARTITION BY `grp`) AS `q01`
#> FROM `dbplyr_001`
#> )
#> WHERE (`win` = `q01`)
#> )
#> ) AS `RHS`
#> ON (`LHS`.`grp` = `RHS`.`grp`)
#> )
#> ORDER BY `grp`, `win`
Created on 2022-12-04 with reprex v2.0.2
CodePudding user response:
While last()
seems to be broken first()
appears to work as expected, so you could make use of the order_by
argument to get the last value:
library(dbplyr)
library(dplyr, w = F)
remote_data %>%
arrange(grp, win) %>%
group_by(grp) %>%
mutate(last_id = first(id, order_by = desc(win))) %>%
ungroup() %>%
print() %>%
show_query()
# Source: SQL [7 x 4]
# Database: sqlite 3.39.4 [:memory:]
# Ordered by: grp, win
grp win id last_id
<dbl> <chr> <dbl> <dbl>
1 1 A 4 6
2 1 B 7 6
3 1 C 6 6
4 2 A 5 3
5 2 B 1 3
6 2 C 3 3
7 3 C 2 2
<SQL>
SELECT
*,
FIRST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win` DESC) AS `last_id`
FROM `dbplyr_001`
ORDER BY `grp`, `win`