Why another benchmark?
- Frank mentioned in his comment that there might be a speed-up by switching from .(rn, gear) to c("rn", "gear"), but he didn't benchmark that separately (the two selection styles are illustrated in the sketch after this list).
- In R yoda's benchmark, the sample data are of type integer but LIMIT <- 500 is of type double. data.table occasionally warns about type conversions, so I wonder what effect the type conversion may have on performance in this case.
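For illustration, here is a minimal sketch of the two selection styles, using mtcars as stand-in data (the rn row-number column is only added for this example):
library(data.table)
dt <- as.data.table(mtcars)[, rn := .I]   # add a row-number column for the example
dt[gear > 4, .(rn, gear)]                 # NSE: columns given as unquoted symbols
dt[gear > 4, c("rn", "gear")]             # SE: columns given as a character vector
Both calls return the same result; the question is whether selecting via .() or via a character vector is faster.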
What to benchmark?
Up to now, 3 answers have been provided which make up five code variants: row.filter, chaining_nse, chaining_se, which_nse, and which_se (see the benchmark code below).
Unfortunately, I couldn't get row.filter to work in an SE version.
Which parameters are used?
- Problem size (number of rows): 10^2, 10^3, ..., 10^8
- Different values for LIMIT: 100, 500, 900
- Type of LIMIT: integer and double, to test the effect of type conversion
The number of repetitions is computed from the problem size with a minimum of 3 runs and a maximum of 100 runs.
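For example, applying this rule (the same formula appears in run_bm() below) to the problem sizes used here gives:
n_rows <- 10^(2:8)
round(scales::squish(sqrt(1E8 / n_rows), c(3L, 100L)))
# 100 100 100  32  10   3   3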
Results
Type conversion costs about 4% (median) to 9% (mean) of performance. So it does matter whether you write LIMIT <- 500 or LIMIT <- 500L, using the L suffix to indicate an integer constant.
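A quick way to check which type a constant has:
typeof(500)    # "double"
typeof(500L)   # "integer"
Since the sampled x column is integer, comparing it against a double LIMIT requires a type conversion, which is the effect measured above.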

The performance penalty for using non-standard evaluation is much higher: NSE needs on average more than 50% more time than SE, for both approaches.
(Note that the chart below only shows results for type integer)
The chart below for limit 500 and type integer indicates that the SE variants are faster than their NSE counterparts for all problem sizes. Interestingly, chaining_se seems to have a slight advantage over which_se for smaller problem sizes up to 5000 rows while for problem sizes above 5 M rows which_se is clearly faster.

By request, here is a table showing the timings in milliseconds for the above chart:
dcast(bm_med[limit == 500L & type == "int"][
  , expr := forcats::fct_reorder(factor(expr), -time)],
  expr ~ n_rows, fun.aggregate = function(x) max(x/1E6), value.var = "time")
expr 100 1000 10000 1e+05 1e+06 1e+07 1e+08
1: chaining_nse 0.8189745 0.8493695 1.0115405 2.870750 22.34469 441.1621 2671.179
2: row.filter 0.7693225 0.7972635 0.9622665 2.677807 21.30861 247.3984 2677.495
3: which_nse 0.8486145 0.8690035 1.0117295 2.620980 18.39406 219.0794 2341.990
4: chaining_se 0.5299360 0.5582545 0.6454755 1.700626 12.48982 166.0164 2049.904
5: which_se 0.5894045 0.6114935 0.7040005 1.624166 13.00125 130.0718 1289.050
Benchmark code
library(data.table)
library(microbenchmark)
run_bm <- function(n_rows, limit = 500L, type = "int") {
  set.seed(1234L)
  DT <- data.table(x = sample(1000, n_rows, replace = TRUE),
                   y = sample(1000, n_rows, replace = TRUE))
  # LIMIT as integer or double, depending on the type parameter
  LIMIT <- switch(type,
                  int = as.integer(limit),
                  dbl = as.double(limit))
  # number of repetitions: between 3 and 100, depending on problem size
  times <- round(scales::squish(sqrt(1E8 / n_rows), c(3L, 100L)))
  cat("Start run:", n_rows, limit, type, times, "\n")
  microbenchmark(
    row.filter = {
      row.numbers <- DT[, .I[x > LIMIT]]
      DT[row.numbers, .(row.numbers, x, y)]
    },
    chaining_nse = {
      DT[, row.number := .I][x > LIMIT, .(row.number, x, y)]
    },
    chaining_se = {
      DT[, row.number := .I][x > LIMIT, c("row.number", "x", "y")]
    },
    which_nse = {
      row.numbers <- DT[x > LIMIT, which = TRUE]
      DT[row.numbers, .(x, y)][, row.numbers := row.numbers][]
    },
    which_se = {
      row.numbers <- DT[x > LIMIT, which = TRUE]
      DT[row.numbers, c("x", "y")][, row.numbers := row.numbers][]
    },
    times = times)
}
# parameter grid
bm_par <- CJ(n_rows = 10^seq(2L, 8L, 1L),
             limit = seq(100L, 900L, 400L),
             type = c("int", "dbl"))
# run the benchmarks
bm_raw <- bm_par[, run_bm(n_rows, limit, type), by = .(n_rows, limit, type)]
# aggregate results: median time per combination
bm_med <- bm_raw[, .(time = median(time)), by = .(n_rows, limit, type, expr)]
Graphics code
library(ggplot2)
# chart 1
ggplot(
  dcast(bm_med, n_rows + limit + expr ~ type, value.var = "time")[
    , ratio := dbl / int - 1.0] #[limit == 500L]
) +
  aes(n_rows, ratio, colour = expr) +
  geom_point() +
  geom_line() +
  facet_grid(limit ~ expr) +
  scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
  scale_y_continuous(labels = scales::percent) +
  coord_cartesian(ylim = c(-0.1, 0.5)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  ggtitle("Performance loss due to type conversion") +
  ylab("Relative computing time dbl vs int") +
  xlab("Number of rows (log scale)")
ggsave("p2.png")
# chart 2
ggplot(
  dcast(bm_med[, c("code", "eval") := tstrsplit(expr, "_")][!is.na(eval)],
        n_rows + limit + type + code ~ eval, value.var = "time")[
    , ratio := nse / se - 1.0][type == "int"]
) +
  aes(n_rows, ratio, colour = code) +
  geom_point() +
  geom_line() +
  facet_grid(limit + type ~ code) +
  scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
  scale_y_continuous(labels = scales::percent) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  ggtitle("Performance loss due to non-standard evaluation") +
  ylab("Relative computing time NSE vs SE") +
  xlab("Number of rows (log scale)")
ggsave("p3.png")
# chart 3
ggplot(bm_med[limit == 500L][type == "int"]) +
  aes(n_rows, time/1E6, colour = expr) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_grid(type ~ limit) +
  scale_x_log10(labels = function(x) scales::math_format()(log10(x))) +
  scale_y_log10(labels = function(x) scales::math_format()(log10(x))) +
  theme_bw() +
  ggtitle("Benchmark results (log-log scale)") +
  ylab("Computing time in ms (log scale)") +
  xlab("Number of rows (log scale)")
ggsave("p1.png")