For the following toy example, which involves both window calculations and groupby aggregations, DuckDB runs nearly 3x slower than Polars in Python. Both give exactly the same results.
Is this kind of benchmarking result expected because DuckDB is designed for, and should mainly be used for, cross-dataframe/table operations? Or does the inefficiency simply come from the way my SQL query is written?
import time
import duckdb
import numpy as np
import polars as pl
## example dataframe
rng = np.random.default_rng(1)
nrows = 10_000_000
df = pl.DataFrame(
dict(
id=rng.integers(1, 100, nrows),
id2=rng.integers(1, 1_000, nrows),
v1=rng.normal(0, 1, nrows),
v2=rng.normal(0, 1, nrows),
v3=rng.normal(0, 1, nrows),
v4=rng.normal(0, 1, nrows),
)
)
## polars
start = time.perf_counter()
res = (
df.select(
[
"id",
"id2",
pl.col("v1") - pl.col("v1").mean().over(["id", "id2"]),
pl.col("v2") - pl.col("v2").mean().over(["id", "id2"]),
pl.col("v3") - pl.col("v3").mean().over(["id", "id2"]),
pl.col("v4") - pl.col("v4").mean().over(["id", "id2"]),
]
)
.groupby(["id", "id2"])
.agg(
[
(pl.col("v1") * pl.col("v2")).sum().alias("ans1"),
(pl.col("v3") * pl.col("v4")).sum().alias("ans2"),
]
)
)
print(time.perf_counter() - start)
# 1.0977217499166727
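As a sanity check on the Polars side, the same pipeline can also be written through the lazy API, so the optimizer sees the window step and the aggregation together. This is only an untimed sketch and not part of the numbers above.
## polars, lazy variant (untimed sketch; not part of the benchmark above)
res_lazy = (
    df.lazy()
    .select(
        [
            "id",
            "id2",
            pl.col("v1") - pl.col("v1").mean().over(["id", "id2"]),
            pl.col("v2") - pl.col("v2").mean().over(["id", "id2"]),
            pl.col("v3") - pl.col("v3").mean().over(["id", "id2"]),
            pl.col("v4") - pl.col("v4").mean().over(["id", "id2"]),
        ]
    )
    .groupby(["id", "id2"])
    .agg(
        [
            (pl.col("v1") * pl.col("v2")).sum().alias("ans1"),
            (pl.col("v3") * pl.col("v4")).sum().alias("ans2"),
        ]
    )
    .collect()
)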
## duckdb
start = time.perf_counter()
res2 = (
duckdb.sql(
"""
SELECT id, id2,
v1 - mean(v1) OVER (PARTITION BY id, id2) as v1,
v2 - mean(v2) OVER (PARTITION BY id, id2) as v2,
v3 - mean(v3) OVER (PARTITION BY id, id2) as v3,
v4 - mean(v4) OVER (PARTITION BY id, id2) as v4
FROM df
"""
)
.aggregate(
"id, id2, sum(v1 * v2) as ans1, sum(v3 * v4) as ans2",
"id, id2",
)
.pl()
)
print(time.perf_counter() - start)
# 3.549897135235369
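In case the split between the SQL window step and the relational .aggregate call is itself the problem, this is the single-statement rewrite I have in mind. It is only an unbenchmarked sketch of the query I am asking about; the CTE name "centred" is arbitrary.
## duckdb, single-statement variant (unbenchmarked sketch; CTE name is arbitrary)
res3 = duckdb.sql(
    """
    WITH centred AS (
        SELECT id, id2,
            v1 - mean(v1) OVER (PARTITION BY id, id2) as v1,
            v2 - mean(v2) OVER (PARTITION BY id, id2) as v2,
            v3 - mean(v3) OVER (PARTITION BY id, id2) as v3,
            v4 - mean(v4) OVER (PARTITION BY id, id2) as v4
        FROM df
    )
    SELECT id, id2,
        sum(v1 * v2) as ans1,
        sum(v3 * v4) as ans2
    FROM centred
    GROUP BY id, id2
    """
).pl()
I would expect DuckDB's optimizer to plan this the same way as the two-step version above, but I am not sure whether that is actually the case.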