A few options below with benchmarking. The best seems to be subsetting the 2-permutations of row indices where id
is different (f1
).
library(data.table)
library(microbenchmark)
# column-bind based on all index pairs where the id values are different
f1 <- function(dt) {
setnames(
CJ(1:nrow(dt), 1:nrow(dt))[dt$id[V1] != dt$id[V2], setDT(c(dt[V1], dt[V2]))],
c(colnames(dt), paste0(colnames(dt), "_new"))
)
}
# cartesian join, filter during the join
fsub <- function(dt, i) as.list(dt[id != i])
f2 <- function(dt) setnames(dt[, fsub(dt, id), id:x], c(colnames(dt), paste0(colnames(dt), "_new")))
# cartesian join, filter after the join
f3 <- function(dt) setnames(dt[, as.list(dt), id:x], c(colnames(dt), paste0(colnames(dt), "_new")))[id != id_new]
# combining two non-equi joins
f4 <- function(dt) {
rbindlist(
list(
dt[
dt,
.(id = x.id, x = x.x, id_new = i.id, x_new = i.x),
on = .(id < id),
allow.cartesian = TRUE,
nomatch = 0
],
dt[
dt,
.(id = x.id, x = x.x, id_new = i.id, x_new = i.x),
on = .(id > id),
allow.cartesian = TRUE,
nomatch = 0
]
)
)
}
# OP data
dt <- data.table(id = 1:3, x = 2:4)
ldt <- list(f1(dt), f2(dt), f3(dt), f3(dt), setorder(f4(dt), id, x, id_new))
identical(ldt[-1], ldt[-length(ldt)])
#> [1] TRUE
# a bigger dataset
dt <- data.table(id = rep(1:25, each = 4), x = 1:100)
ldt <- list(f1(dt), f2(dt), f3(dt), f3(dt), setorder(f4(dt), id, x, id_new))
identical(ldt[-1], ldt[-length(ldt)])
#> [1] TRUE
microbenchmark(f1 = f1(dt),
f2 = f2(dt),
f3 = f3(dt),
f4 = f4(dt))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 1.1789 1.34895 1.934972 1.56705 2.06485 12.7612 100
#> f2 8.4337 9.23755 10.398676 9.89295 10.69075 14.6312 100
#> f3 1.7287 1.96755 2.457754 2.29990 2.66350 6.1247 100
#> f4 2.1832 2.42430 2.788016 2.58395 2.83860 5.6919 100
# an even bigger one
dt <- data.table(id = rep(1:10, each = 100), x = 1:1000)
ldt <- list(f1(dt), f2(dt), f3(dt), f3(dt), setorder(f4(dt), id, x, id_new))
identical(ldt[-1], ldt[-length(ldt)])
#> [1] TRUE
microbenchmark(f1 = f1(dt),
f2 = f2(dt),
f3 = f3(dt),
f4 = f4(dt))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 14.3543 16.32115 20.59036 19.07030 22.01220 60.0190 100
#> f2 93.5400 100.53905 108.82218 104.81745 111.09415 145.3390 100
#> f3 22.0357 28.98800 36.93655 33.04950 36.53275 80.2040 100
#> f4 14.8955 18.53780 23.03147 21.26855 24.63430 56.4298 100