4

Consider the following remote table:

library(dbplyr)
library(dplyr, w = F)
remote_data <- memdb_frame(
  grp = c(2, 2, 2, 1, 3, 1, 1),
  win = c("B", "C", "A", "B", "C", "A", "C"),
  id = c(1,3,5,7,2,4,6),
)

I wish to group by grp, order by win and take the last id. This is fairly straightforward if I collect first

# intended output when collecting first
remote_data %>% 
  collect() %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup()
#> # A tibble: 7 × 4
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2

However I cannot directly convert this to {dbplyr} code by removing collect(), though the SQL code doesn't look bad, what's happening here ?

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       4
#> 2     1 B         7       7
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       1
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT
#>   *,
#>   LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

dbplyr::window_order() allows us to override th ORDER BY clause created by the group_by(), I tried window_order(,win), but no cookie:

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  window_order(win) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       4
#> 2     1 B         7       7
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       1
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

For some reason window_order(,grp) does trigger a window calculation but not with the expected order:

remote_data %>% 
  arrange(grp, win) %>% 
  group_by(grp) %>% 
  window_order(grp) %>% 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       5
#> 5     2 B         1       5
#> 6     2 C         3       5
#> 7     3 C         2       2
#> <SQL>
#> SELECT *, LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `grp`) AS `last_id`
#> FROM `dbplyr_001`
#> ORDER BY `grp`, `win`

What can I do to keep my initial output with only remote computations, preferably {dbplyr} code ?

moodymudskipper
  • 46,417
  • 11
  • 121
  • 167

3 Answers3

3

It seems like you need to use window_frame():

library(dbplyr)
library(dplyr, w = F)
remote_data <- memdb_frame(
  grp = c(2, 2, 2, 1, 3, 1, 1),
  win = c("B", "C", "A", "B", "C", "A", "C"),
  id = c(1,3,5,7,2,4,6),
)

remote_data %>% 
  group_by(grp) %>% 
  window_order(win) %>% 
  window_frame() |> 
  mutate(last_id = last(id)) %>% 
  ungroup() %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT
#>   *,
#>   LAST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `win` ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS `last_id`
#> FROM `dbplyr_001`

Created on 2022-12-05 with reprex v2.0.2

This feels like a bug in dbplyr to me. Can you please open an issue in the dbplyr repo? Then I can fix this for the next dbplyr release.

2

While last() seems to be broken first() appears to work as expected, so you could make use of the order_by argument and row_number() to get the last value. You also need to move arrange() further down the pipeline and use window_order() in its place:

library(dbplyr)
library(dplyr, w = F)

remote_data %>% 
  window_order(grp, win) %>% 
  group_by(grp) %>% 
  mutate(rn = row_number(),
         last_id = first(id, order_by = desc(rn))) %>% 
  arrange(grp, win, rn) %>%
  ungroup() %>% 
  select(-rn) %>%
  print() %>%
  show_query()

# Source:     SQL [7 x 4]
# Database:   sqlite 3.39.4 [:memory:]
# Ordered by: grp, win, rn
    grp win      id last_id
  <dbl> <chr> <dbl>   <dbl>
1     1 A         4       6
2     1 B         7       6
3     1 C         6       6
4     2 A         5       3
5     2 B         1       3
6     2 C         3       3
7     3 C         2       2
<SQL>
SELECT
  `grp`,
  `win`,
  `id`,
  FIRST_VALUE(`id`) OVER (PARTITION BY `grp` ORDER BY `rn` DESC) AS `last_id`
FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY `grp` ORDER BY `grp`, `win`) AS `rn`
  FROM `dbplyr_008`
)
ORDER BY `grp`, `win`, `rn`
Ritchie Sacramento
  • 29,890
  • 4
  • 48
  • 56
  • 1
    Thanks Ritchie, I'm ticking the other answer since it uses last() and is more canonical since it's from the package maintainer but it was clever and helpful. – moodymudskipper Dec 05 '22 at 22:04
1

Here is a work around using a join, but that's not very satisfying, possibly inefficient too:

lkp <- remote_data %>% 
  group_by(grp) %>% 
  filter(win == max(win, na.rm = TRUE)) %>% 
  ungroup() %>% 
  select(grp,  last_id = id) %>% 
  distinct()

remote_data %>% 
  left_join(lkp, by = "grp") %>% 
  arrange(grp, win) %>% 
  print() %>% 
  show_query()
#> # Source:     SQL [7 x 4]
#> # Database:   sqlite 3.39.4 [:memory:]
#> # Ordered by: grp, win
#>     grp win      id last_id
#>   <dbl> <chr> <dbl>   <dbl>
#> 1     1 A         4       6
#> 2     1 B         7       6
#> 3     1 C         6       6
#> 4     2 A         5       3
#> 5     2 B         1       3
#> 6     2 C         3       3
#> 7     3 C         2       2
#> <SQL>
#> SELECT *
#> FROM (
#>   SELECT `LHS`.`grp` AS `grp`, `win`, `id`, `last_id`
#>   FROM `dbplyr_001` AS `LHS`
#>   LEFT JOIN (
#>     SELECT DISTINCT `grp`, `id` AS `last_id`
#>     FROM (
#>       SELECT `grp`, `win`, `id`
#>       FROM (
#>         SELECT *, MAX(`win`) OVER (PARTITION BY `grp`) AS `q01`
#>         FROM `dbplyr_001`
#>       )
#>       WHERE (`win` = `q01`)
#>     )
#>   ) AS `RHS`
#>     ON (`LHS`.`grp` = `RHS`.`grp`)
#> )
#> ORDER BY `grp`, `win`

Created on 2022-12-04 with reprex v2.0.2

moodymudskipper
  • 46,417
  • 11
  • 121
  • 167