1

I would like to able to do something equivalent to this using dbplyr.

library(magrittr)
library(tidyverse)
library(bigrquery)
library(dbplyr)
#> 
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#> 
#>     ident, sql

bq_deauth()
bq_auth()

bq_conn = dbConnect(
  bigquery(),
  project = "eacri-genomics"
)


df = tibble(
  chr =   c(1,1,1,2,2,3),
  start = c(0, 10, 12, 0, 5, 1),
  end =   c(2, 11, 15, 1, 8, 3)
)
df
#> # A tibble: 6 x 3
#>     chr start   end
#>   <dbl> <dbl> <dbl>
#> 1     1     0     2
#> 2     1    10    11
#> 3     1    12    15
#> 4     2     0     1
#> 5     2     5     8
#> 6     3     1     3

df %>% 
  rowwise() %>% mutate(range = list(seq(start, end)))
#> # A tibble: 6 x 4
#> # Rowwise: 
#>     chr start   end range    
#>   <dbl> <dbl> <dbl> <list>   
#> 1     1     0     2 <int [3]>
#> 2     1    10    11 <int [2]>
#> 3     1    12    15 <int [4]>
#> 4     2     0     1 <int [2]>
#> 5     2     5     8 <int [4]>
#> 6     3     1     3 <int [3]>

df %>% 
  rowwise() %>% mutate(range = list(seq(start, end))) %>% 
  unnest(range)
#> # A tibble: 18 x 4
#>      chr start   end range
#>    <dbl> <dbl> <dbl> <int>
#>  1     1     0     2     0
#>  2     1     0     2     1
#>  3     1     0     2     2
#>  4     1    10    11    10
#>  5     1    10    11    11
#>  6     1    12    15    12
#>  7     1    12    15    13
#>  8     1    12    15    14
#>  9     1    12    15    15
#> 10     2     0     1     0
#> 11     2     0     1     1
#> 12     2     5     8     5
#> 13     2     5     8     6
#> 14     2     5     8     7
#> 15     2     5     8     8
#> 16     3     1     3     1
#> 17     3     1     3     2
#> 18     3     1     3     3



dbWriteTable(
  bq_conn,
  name = "balter.range_test",
  value = df,
  overwrite = T
)

df_bq = tbl(bq_conn, "balter.range_test")

df_bq %>% 
  rowwise() %>% mutate(range = list(seq(start, end)))
#> Error in UseMethod("rowwise"): no applicable method for 'rowwise' applied to an object of class "c('tbl_BigQueryConnection', 'tbl_dbi', 'tbl_sql', 'tbl_lazy', 'tbl')"

df_bq %>% 
  mutate(range = generate_series(start, end))
#> Error: Job 'eacri-genomics.job_dnmwBmRtY9j0heYY1AtmwEtgblGp.US' failed
#> x Function not found: generate_series at [1:31] [invalidQuery]

Created on 2021-02-18 by the reprex package (v1.0.0)

abalter
  • 9,663
  • 17
  • 90
  • 145

2 Answers2

1

I found a function that is NOT canonical postgresql, but does work in BigQuery: generate_array.

library(magrittr)
library(tidyverse)
library(dbplyr)
#> 
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#> 
#>     ident, sql
library(bigrquery)
library(DBI)
library(RPostgres)
library(RPostgreSQL)

bq_deauth()
bq_auth(email="ariel.balter@gmail.com")

bq_conn = dbConnect(
  bigquery(),
  project = "elite-magpie-257717",
  dataset = "test_dataset"
)

df = tibble(
  chr =   c(1,1,1,2,2,3),
  start = c(0, 10, 12, 0, 5, 1),
  end =   c(2, 11, 15, 1, 8, 3)
)

df %>%
  rowwise() %>% mutate(range = list(seq(start, end)))
#> # A tibble: 6 x 4
#> # Rowwise: 
#>     chr start   end range    
#>   <dbl> <dbl> <dbl> <list>   
#> 1     1     0     2 <int [3]>
#> 2     1    10    11 <int [2]>
#> 3     1    12    15 <int [4]>
#> 4     2     0     1 <int [2]>
#> 5     2     5     8 <int [4]>
#> 6     3     1     3 <int [3]>

df %>%
  rowwise() %>% mutate(range = list(seq(start, end))) %>%
  unnest(range)
#> # A tibble: 18 x 4
#>      chr start   end range
#>    <dbl> <dbl> <dbl> <int>
#>  1     1     0     2     0
#>  2     1     0     2     1
#>  3     1     0     2     2
#>  4     1    10    11    10
#>  5     1    10    11    11
#>  6     1    12    15    12
#>  7     1    12    15    13
#>  8     1    12    15    14
#>  9     1    12    15    15
#> 10     2     0     1     0
#> 11     2     0     1     1
#> 12     2     5     8     5
#> 13     2     5     8     6
#> 14     2     5     8     7
#> 15     2     5     8     8
#> 16     3     1     3     1
#> 17     3     1     3     2
#> 18     3     1     3     3

dbWriteTable(
  bq_conn,
  name = "test_dataset.range_test",
  value = df,
  overwrite = T
)
#> Auto-refreshing stale OAuth token.

df_bq = tbl(bq_conn, "test_dataset.range_test")

df_bq %>%
  mutate(range = generate_array(start, end, 1))
#> # Source:   lazy query [?? x 4]
#> # Database: BigQueryConnection
#>     end start   chr range    
#>   <int> <int> <int> <list>   
#> 1     2     0     1 <dbl [3]>
#> 2    11    10     1 <dbl [2]>
#> 3    15    12     1 <dbl [4]>
#> 4     1     0     2 <dbl [2]>
#> 5     8     5     2 <dbl [4]>
#> 6     3     1     3 <dbl [3]>

Created on 2021-02-19 by the reprex package (v1.0.0)

abalter
  • 9,663
  • 17
  • 90
  • 145
0

Before attempting to do this with dbplyr it is worth first considering whether the database you are using supports having columns of type list/array. This is required for your range <list> column.

I suspect that (1) this feature is not common/widely supported in many databases, and (2) dbplyr does not currently provide straightforward translation where it is. (For example, see these two questions: one and two).

But as your sequence is just a number range you could accomplish the same thing via a join:

df = tibble(
  chr =   c(1,1,1,2,2,3),
  start = c(0, 10, 12, 0, 5, 1),
  end =   c(2, 11, 15, 1, 8, 3)
)

whole_range = tibble(range = -100:100)
# need start < min(df$start) and end > max(df$end)

dbWriteTable(conn, name = "df", value = df, overwrite = T)
dbWriteTable(conn, name = "whole_range", value = whole_range, overwrite = T)

remote_df = tbl(conn, "df")
remote_whole_range = tbl(conn, "whole_range")

# create dummy columns to join
remote_df = remote_df %>% mutate(ones = 1)
remote_whole_range = remote_whole_range %>% mutate(ones = 1)
# join and filter
remote_output = inner_join(remote_df, remote_whole_range, by = "ones") %>%
  filter(start <= range,
         range <= end)
Simon.S.A.
  • 6,240
  • 7
  • 22
  • 41