3

How would I implement method dispatch for a function inside of dplyr::do?

I've read through GitHub issues #719, #3558 and #3429 which have helpful information on how to create methods for dplyr verbs, but nothing in particular that works for dplyr::do - which is sort of "special" in the sense that the dispatch not only needs to happen for dplyr:do itself, but also for the function that is called inside dplyr::do (or at least that's what I'm after)

Here's what I tried:

Preliminaries

library(dplyr)
#> 
#> Attache Paket: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Example data ------------------------------------------------------------

df <- tibble::tibble(
  id = c(rep("A", 5), rep("B", 5)),
  x = 1:10
)

df_custom <- df
class(df_custom) <- c("tbl_df_custom", class(df_custom))

# Reclass function --------------------------------------------------------

reclass <- function(x, result) {
  UseMethod('reclass')
}

reclass.default <- function(x, result) {
  class(result) <- unique(c(class(x)[[1]], class(result)))
  attr(result, class(x)[[1]]) <- attr(x, class(x)[[1]])
  result
}

Step 1: try to define a method for a dplyr verb

# Custom method for summarize ---------------------------------------------

summarise.tbl_df_custom <- function (.data, ...) {
  message("Custom method for `summarise`")
  result <- NextMethod("summarise")
  ret <- reclass(.data, result)
  print(class(ret))
  ret
}

ret <- df_custom %>%
  summarise(y = mean(x))
#> Custom method for `summarise`
#> [1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"
ret %>% class()
#> [1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"

Step 2: try to define a method for a another dplyr verb to test longer pipe

# Custom method for group_by ----------------------------------------------

group_by.tbl_df_custom <- function (.data, ..., add = FALSE) {
  message("Custom method for `group_by`")
  result <- NextMethod("group_by")
  ret <- reclass(.data, result)
  print(class(ret))
  ret
}

ret <- df_custom %>%
  group_by(id) %>%
  summarise(y = mean(x))
#> Custom method for `group_by`
#> [1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"          
#> [5] "data.frame"
#> Custom method for `summarise`
#> [1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"
ret %>% class()
#> [1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"

Step 3: trying the same for do

# Custom method for do ----------------------------------------------------

do.tbl_df_custom <- function (.data, ...) {
  message("custom method for `do`")
  result <- NextMethod("do")
  ret <- reclass(.data, result)
  print(class(ret))
  ret
}

foo <- function(df) {
  UseMethod("foo")
}

foo.default <- function(df) {
  message("Default method for `foo`")
  df %>%
    summarise(y = mean(x))
}

foo.tbl_df_custom <- function(df) {
  message("Custom method for `foo`")
  df %>%
    summarise(y = mean(x) * 100)
}

ret <- df_custom %>%
  group_by(id) %>%
  do(foo(.))
#> Custom method for `group_by`
#> [1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"          
#> [5] "data.frame"
#> custom method for `do`
#> Default method for `foo`
#> Default method for `foo`
#> [1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"          
#> [5] "data.frame"
ret
#> # A tibble: 2 x 2
#> # Groups:   id [2]
#>   id        y
#>   <chr> <dbl>
#> 1 A         3
#> 2 B         8
ret %>% class()
#> [1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"          
#> [5] "data.frame"

While this looks okay on first sight, the problem is that the default instead of the custom method for foo is called.

Created on 2019-01-08 by the reprex package (v0.2.1)

Rappster
  • 12,762
  • 7
  • 71
  • 120
  • 1
    Fascinating issue, I can't track down the problem but here is some potentially useful info. The custom `do` function works fine for ungrouped dataframes: `df_custom %>% do(foo(.))`. It's only when you `group_by` that the problem arises. Grouped dataframes are handled differently internally in `dplyr`. Notice how the code prints "Default method for `foo`" twice. It's because there's a double `for` loop inside the `do.grouped_df` function which takes slices of your dataframe by group. When these slices are taken, the special class you define is lost. Check out `class(df_custom[1,])` – astrofunkswag Jan 08 '19 at 01:36
  • 2
    `group_by` calls `ungroup`, another generic, that strips off the custom class. Maybe an `ungroup.tbl_df_custom` method is needed. – user2554330 Jan 08 '19 at 01:54

2 Answers2

1

So the issue is related to this question I just asked. I was able to solve it by defining 3 new funtions: ungroup.tbl_df_custom, a class constructor function, and [.tbl_df_custom.

ungroup.tbl_df_custom <- function (.data, ...) {
  message("custom method for `ungroup`")
  result <- NextMethod("ungroup")
  ret <- reclass(.data, result)
  ret
}


new_custom <- function(x, ...) {

  structure(x, class = c("tbl_df_custom", class(x)))
}

`[.tbl_df_custom` <- function(x, ...) {
  new_custom(NextMethod())
}



df_custom2 <- new_custom(df)


df_custom2 %>%
  group_by(id) %>%
  do(foo(.))

Custom method for `group_by`
[1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"           "data.frame"   
custom method for `do`
custom method for `ungroup`
Custom method for `foo`
Custom method for `summarise`
[1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"   
Custom method for `foo`
Custom method for `summarise`
[1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"   
[1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"           "data.frame"   
custom method for `ungroup`
# A tibble: 2 x 2
# Groups:   id [2]
  id        y
  <chr> <dbl>
1 A       300
2 B       800
astrofunkswag
  • 2,608
  • 12
  • 25
  • Thanks man, totally missed the `ungroup` aspect! I'll accept your answer. For the pure sake of a complete and self-contained reference, I'll post a own answer that has all the code from start to finish for my particular example. – Rappster Jan 08 '19 at 09:47
  • As I was reading through the chapter [S3 inheritance in advanced R](https://adv-r.hadley.nz/s3.html#s3-inheritance), I noticed that I can swap my `reclass` function for `vctrs::vec_restore` which also has a method for `data.frames` (see `library(vctrs); sloop::s3_methods_generic("vec_restore")`. Apparantly, this also makes the `[.tbl_df_custom` method obsolete and thus simplifies things a bit. – Rappster Jan 08 '19 at 09:52
  • I was wrong about being able to drop `[.tbl_df_custom`, it is in fact crucial. Thanks again for coming up with the answer, learned a lot today :-) – Rappster Jan 08 '19 at 13:55
1

For the pure sake of having a complete and self-contained example with all the code from start to finish for my particular example, I'll also post an own answer here.

A couple of things to highlight:

  1. Except for my custom method for group_by(), I could swap reclass() for the much better vctrs::vec_restore(), which also happens to have a data.frame method (see library(vctrs); sloop::s3_methods_generic("vec_restore")).

    You can find more info on vctrs::vec_restore() in chapter S3 inheritance of Advanced R as well as as the S3 vectors article on https://vctrs.r-lib.org/

    It'd be great if there was something like a combine argument in vctrs::vec_restore() to make it consider the grouped_df() class attribute that is added by calling the default method of group_by(), but that's another story (for which I filed an inquisitive GitHub issue).

    Currently, our custom class info would be dropped due to the way vctrs::vec_restore() is implemented (see "Testing things out" below).

  2. GitHub issues I found very helpful: #3429 and especially #3923

Code

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Constructor for tbl_df_custom class -------------------------------------

new_df_custom <- function(x = tibble()) {
  stopifnot(tibble::is_tibble(x))
  structure(x, class = c("tbl_df_custom", class(x)))
}

# Example data ------------------------------------------------------------

df_custom <- new_df_custom(
  x = tibble::tibble(
    id = c(rep("A", 3), rep("B", 3)),
    x = 1:6
  )
)

df_custom
#> # A tibble: 6 x 2
#>   id        x
#> * <chr> <int>
#> 1 A         1
#> 2 A         2
#> 3 A         3
#> 4 B         4
#> 5 B         5
#> 6 B         6
df_custom %>% class()
#> [1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"

# Reclass function for preserving custom class attribute ------------------

reclass <- function(x, to) {
  UseMethod('reclass')
}

reclass.default <- function(x, to) {
  class(x) <- unique(c(class(to)[[1]], class(x)))
  attr(x, class(to)[[1]]) <- attr(to, class(to)[[1]])
  x
}

# Custom method for summarise ---------------------------------------------

summarise.tbl_df_custom <- function (.data, ...) {
  message("Custom method for `summarise`")
  vctrs::vec_restore(NextMethod(), .data)
}

# Custom method for group_by ----------------------------------------------

group_by.tbl_df_custom <- function (.data, ..., add = FALSE, 
  use_vec_restore = FALSE
) {
  message("Custom method for `group_by`")
  retval <- reclass(NextMethod(), .data)
  print(class(retval))
  retval
}

# Custom method for ungroup ----------------------------------------------

ungroup.tbl_df_custom <- function (.data, ...) {
  message("custom method for `ungroup`")
  vctrs::vec_restore(NextMethod(), .data)
}

# Custom method for do ----------------------------------------------------

do.tbl_df_custom <- function (.data, ...) {
  message("custom method for `do`")
  vctrs::vec_restore(NextMethod(), .data)
}

# Custom extraction method ------------------------------------------------

`[.tbl_df_custom` <- function(x, ...) {
  message("custom method for `[`")
  new_df_custom(NextMethod())
}

# Create custom methods for foo -------------------------------------------

foo <- function(df) {
  UseMethod("foo")
}

foo.default <- function(df) {
  message("Default method for `foo`")
  df %>%
    summarise(y = mean(x))
}

foo.tbl_df_custom <- function(df) {
  message("Custom method for `foo`")
  df %>%
    summarise(y = mean(x) * 100)
}

# Testing things out ------------------------------------------------------

retval <- df_custom %>%
  group_by(id) %>%
  do(foo(.))
#> Custom method for `group_by`
#> [1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"          
#> [5] "data.frame"
#> custom method for `do`
#> custom method for `ungroup`
#> custom method for `[`
#> Custom method for `foo`
#> Custom method for `summarise`
#> custom method for `[`
#> Custom method for `foo`
#> Custom method for `summarise`

retval
#> custom method for `[`
#> custom method for `ungroup`
#> # A tibble: 2 x 2
#> # Groups:   id [2]
#>   id        y
#>   <chr> <dbl>
#> 1 A       200
#> 2 B       500
retval %>% class()
#> [1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"          
#> [5] "data.frame"

Created on 2019-01-08 by the reprex package (v0.2.1)

Alternative to reclass(): vctrs::vec_restore()

# Alternative version for group_by that uses vctrs::vec_restore -----------

group_by.tbl_df_custom <- function (.data, ..., add = FALSE) {
  message("Custom method for `group_by`")
  retval <- vctrs::vec_restore(NextMethod(), .data)
  print(class(retval))
  retval
}

retval <- df_custom %>%
  group_by(id) %>%
  do(foo(.))
#> Custom method for `group_by`
#> [1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"
#> custom method for `do`
#> Custom method for `foo`
#> Custom method for `summarise`

retval
#> custom method for `[`
#> # A tibble: 1 x 1
#>       y
#>   <dbl>
#> 1   350
retval %>% class()
#> [1] "tbl_df_custom" "tbl_df"        "tbl"           "data.frame"

Created on 2019-01-08 by the reprex package (v0.2.1)

As mentioned above, note that when using the alternative version of group_by() that uses vctrs::vec_restore() instead of reclass(), the class attribute grouped_df is dropped.

Alternative to reclass(): vec_restore_inclusive()

This is an own implementation that tries to leverage the way vctrs::vec_restore() works while also considering attributes of to in the decision of how the "reset" is carried out. Arguably, "combine" or "align" would be better name components for the function.

vec_restore_inclusive <- function(x, to) {
  UseMethod('vec_restore_inclusive')
}

vec_restore_inclusive.data.frame <- function (x, to) {
  attr_to <- attributes(to)
  attr_x <- attributes(x)
  attr_use <- if (
    length(classes_preserve <- setdiff(attr_to[["class"]], attr_x[["class"]]))
  ) {
    attr_x
  } else {
    attr_to
  }

  attr_use[["names"]] <- attr_x[["names"]]
  attr_use[["row.names"]] <- .set_row_names(vctrs:::df_length(x))
  attr_use[["class"]] <- unique(c(classes_preserve, attr_x[["class"]]))
  attributes(x) <- attr_use
  x
}

group_by.tbl_df_custom <- function (.data, ..., add = FALSE) {
  message("Custom method for `group_by`")
  retval <- vec_restore_inclusive(NextMethod(), .data)
  print(class(retval))
  retval
}

retval <- df_custom %>%
  group_by(id) %>%
  do(foo(.))
#> Custom method for `group_by`
#> [1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"          
#> [5] "data.frame"
#> custom method for `do`
#> custom method for `ungroup`
#> custom method for `[`
#> Custom method for `foo`
#> Custom method for `summarise`
#> custom method for `[`
#> Custom method for `foo`
#> Custom method for `summarise`

retval
#> custom method for `[`
#> custom method for `ungroup`
#> # A tibble: 2 x 2
#> # Groups:   id [2]
#>   id        y
#>   <chr> <dbl>
#> 1 A       200
#> 2 B       500
retval %>% class()
#> [1] "tbl_df_custom" "grouped_df"    "tbl_df"        "tbl"          
#> [5] "data.frame"

Created on 2019-01-08 by the reprex package (v0.2.1)

Rappster
  • 12,762
  • 7
  • 71
  • 120