0

I am using RSQLite, DBI, dbplyr, and sqldf packages.

Here are the packages:

library(dbplyr)
library(RSQLite)
library(DBI)
library(sqldf)
library(tidyverse)

First I'm using mtcars dataset with the rownames included.

mtcars <- tibble::as_tibble(mtcars, rownames = 'car')  

I also created my own table which I combined with a left_join.

mtcars %>% dplyr::left_join(cars_origin_tbl, by = 'car') -> mtcars

car <- c("Mazda RX4", "Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", "Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", "Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", "Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", "Volvo 142E")

origin <- c("Japan", "Japan", "Japan", "United States", "United States", "United States", "United States", "Germany", "Germany", "Germany", "Germany", "Germany", "Germany", "Germany", "United States", "United States", "United States", "Italy", "Japan", "Japan", "Japan", "United States", "United States", "United States", "United States", "Italy", "Germany", "British", "United States", "Italy", "Italy", "Sweden")

cars_origin_tbl <- tibble(car, origin)

Now I establish the connection to SQLite. I create a table using dplyr methods and I see what that looks like as SQL.

sql_mtcars <- mtcars

con <- RSQLite::dbConnect(SQLite(), ":memory:")
dplyr::copy_to(con, sql_mtcars)

tbl(con, "sql_mtcars") %>%
  dplyr::select(car, origin, mpg, disp, hp, drat, wt, qsec) %>%
  pivot_longer(names_to = 'names', values_to = 'values', 4:8) %>%
  group_by(names) %>%
  summarize(sd = sd(values)) %>%
  dplyr::show_query()

Here is what that table would look like if I used a dataframe and did not use show_query:

enter image description here

And here is what the SQL looks like from the show_query function:

enter image description here

I can copy and paste that query into either dbGetQuery or sqldf and in this case they both work the same.

DBI::dbGetQuery(con, "
SELECT `names`, STDEV(`values`) AS `sd`
FROM (SELECT `car`, `origin`, `mpg`, 'disp' AS `names`, `disp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'hp' AS `names`, `hp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'drat' AS `names`, `drat` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'wt' AS `names`, `wt` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'qsec' AS `names`, `qsec` AS `values`
FROM `sql_mtcars`)
GROUP BY `names`
  ")

sqldf("
SELECT `names`, STDEV(`values`) AS `sd`
FROM (SELECT `car`, `origin`, `mpg`, 'disp' AS `names`, `disp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'hp' AS `names`, `hp` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'drat' AS `names`, `drat` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'wt' AS `names`, `wt` AS `values`
FROM `sql_mtcars`
UNION ALL
SELECT `car`, `origin`, `mpg`, 'qsec' AS `names`, `qsec` AS `values`
FROM `sql_mtcars`)
GROUP BY `names`
      ")

Using dbGetQuery or sqldf gives me back the same table that I included a picture of, above.

Ok, but now I go on to a totally query with the same workflow.

con <- RSQLite::dbConnect(SQLite(), ":memory:")
dplyr::copy_to(con, sql_mtcars)

dbListTables(con)

tbl(con, "sql_mtcars") %>%
  group_by(origin) %>%
  summarize(mean = round(mean(mpg), 1), 
            sd = round(sd(mpg),1),
            skew = round(3 * ( ( mean(mpg) - median(mpg) )  / sd(mpg) ),1),
            kurtosis = round(sum((mpg - mean(mpg)) ^ 4) / 32 / (var(mpg) ^ (2)),1)) %>%
  arrange(desc(mean)) %>%
  dplyr::show_query()

Here is what that table would look like if I used a dataframe and did not use show_query:

enter image description here

In this case I can see the SQL from show_query()

enter image description here

But I can't make it compile with either dbGetQuery or sqldf.

DBI::dbGetQuery(con, "
SELECT `origin`, ROUND(AVG(`mpg`), 1) AS `mean`, ROUND(STDEV(`mpg`), 1) AS `sd`, ROUND(3.0 * ((AVG(`mpg`) - MEDIAN(`mpg`)) / STDEV(`mpg`)), 1) AS `skew`, ROUND(SUM(POWER((`mpg` - AVG(`mpg`)), 4.0)) / 32.0 / (POWER(VARIANCE(`mpg`), (2.0))), 1) AS `kurtosis`
FROM `sql_mtcars`
GROUP BY `origin`
ORDER BY `mean` DESC
  ")

sqldf("
SELECT `origin`, ROUND(AVG(`mpg`), 1) AS `mean`, ROUND(STDEV(`mpg`), 1) AS `sd`, ROUND(3.0 * ((AVG(`mpg`) - MEDIAN(`mpg`)) / STDEV(`mpg`)), 1) AS `skew`, ROUND(SUM(POWER((`mpg` - AVG(`mpg`)), 4.0)) / 32.0 / (POWER(VARIANCE(`mpg`), (2.0))), 1) AS `kurtosis`
FROM `sql_mtcars`
GROUP BY `origin`
ORDER BY `mean` DESC
      ")

In both cases, I get this error message:

Error: misuse of aggregate function AVG()

I'm not sure what the issue is. I think it might be one of three general problems:

  • Maybe the SQL query form show_query() incorrect in some way? I just copied and pasted what arrived in my console back into the function.
  • Maybe dbGetQuery and sqldf have bugs in them, and aren't perfect?
  • Maybe it has something to do with NA values in this second dataframe?
hachiko
  • 671
  • 7
  • 20
  • 1
    In the kurtosis expression the code is attempting to use an aggregate avg within another aggregate sum. That is not allowed. – G. Grothendieck Apr 18 '22 at 18:38
  • Also, if you look at the source code of kurtosis in PerformanceAnalytics R package there are R formulas for kurtosis. These are in R rather than SQL but when translated to SQL none of them would require nested aggregate functions. – G. Grothendieck Apr 20 '22 at 16:47

0 Answers0