The following example involves a self conditional join followed by a groupby/aggregate operation. It turns out that in this case DuckDB gives much better performance than Polars (~10x on a 32-core machine).
My questions are:
- What could be the potential reason(s) for the slowness of Polars relative to DuckDB?
- Am I missing some other, faster way of doing the same thing in Polars?
import time
import duckdb
import numpy as np
import polars as pl
## example dataframe
rng = np.random.default_rng(1)
nrows = 5_000_000
df = pl.DataFrame(
    dict(
        id=rng.integers(1, 1_000, nrows),
        id2=rng.integers(1, 10, nrows),
        id3=rng.integers(1, 500, nrows),
        value=rng.normal(0, 1, nrows),
    )
)
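
As a back-of-the-envelope check on how heavy this query is (my own sketch, not part of the benchmark): with id drawn uniformly from 1..999 and id2 from 1..9, there are ~8,991 distinct (id, id2) pairs, so the equi-join alone produces roughly 5_000_000**2 / 8_991 ≈ 2.8 billion candidate rows before the inequality filter runs.

## back-of-the-envelope: size of the intermediate equi-join,
## i.e. the sum of squared group sizes over (id, id2)
join_rows = (
    df.groupby(["id", "id2"])
    .agg(pl.count().alias("n"))
    .select((pl.col("n") ** 2).sum())
    .item()
)
print(join_rows)  # roughly 2.8 billion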
## polars: left equi-join on (id, id2), then filter down to the band 0 < id3 - id3_right < 30
start = time.perf_counter()
res = (
    df.lazy()
    .join(df.lazy(), on=["id", "id2"], how="left")
    .filter(
        (pl.col("id3") > pl.col("id3_right"))
        & (pl.col("id3") - pl.col("id3_right") < 30)
    )
    .groupby(["id2", "id3", "id3_right"])
    .agg(pl.corr("value", "value_right"))
    .collect(streaming=True)
)
time.perf_counter() - start
# 120.93155245436355
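
Regarding question 1, one way to investigate is to print the optimized query plan; I would expect it to show whether the inequality filter is pushed into the join or only applied after the full equi-join is materialized (sketch below, output omitted):

## sketch: inspect the optimized plan instead of collecting
plan = (
    df.lazy()
    .join(df.lazy(), on=["id", "id2"], how="left")
    .filter(
        (pl.col("id3") > pl.col("id3_right"))
        & (pl.col("id3") - pl.col("id3_right") < 30)
    )
    .explain()
)
print(plan)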
## duckdb: the same conditional join, with the inequality conditions in the ON clause
start = time.perf_counter()
res2 = (
    duckdb.sql(
        """
        SELECT df.*, df2.id3 AS id3_right, df2.value AS value_right
        FROM df JOIN df AS df2
        ON (df.id = df2.id
            AND df.id2 = df2.id2
            AND df.id3 > df2.id3
            AND df.id3 - df2.id3 < 30)
        """
    )
    .aggregate(
        "id2, id3, id3_right, corr(value, value_right) AS value",
        "id2, id3, id3_right",
    )
    .pl()
)
time.perf_counter() - start
# 18.472263277042657
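
For question 2, one rewrite I considered (an untested sketch, not benchmarked; the helper column id3_key is my own naming): because the filter pins id3 - id3_right to the range 1..29, the band join can be unrolled into 29 plain equi-joins, one per offset k, which keeps everything on Polars' hash-join path.

## sketch: unroll the band join into 29 exact equi-joins and concatenate
lf = df.lazy()
parts = [
    lf.join(
        lf.with_columns((pl.col("id3") + k).alias("id3_key")),
        left_on=["id", "id2", "id3"],
        right_on=["id", "id2", "id3_key"],
        how="inner",
    )
    for k in range(1, 30)
]
res3 = (
    pl.concat(parts)
    .groupby(["id2", "id3", "id3_right"])
    .agg(pl.corr("value", "value_right"))
    .collect(streaming=True)
)

(Newer Polars releases also ship a dedicated join_where for inequality joins, if I read the changelog correctly, but I have not tried it here.)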