Depending on the dimensionality of your problem, one method is to cross join every row with every center (i.e. build all row-center pairs) and then, for each row, pick the center with the minimum distance.
library(data.table)
library(microbenchmark)
# Example data: work on a data.table copy of iris so the built-in data set
# is never modified by reference.
DT <- copy(iris)
setDT(DT)

# Seed before the random split below so the example is reproducible.
set.seed(0L)

# Feature columns used for the distance: every column starting with "Sepal".
cols <- names(DT)[startsWith(names(DT), "Sepal")]

# Randomly label each row 0 or 1 and average the feature columns per label;
# the per-label means serve as the cluster centers.
centers <- DT[
  ,
  lapply(.SD, mean),
  by = .(cluster = sample(0:1, size = nrow(iris), replace = TRUE)),
  .SDcols = cols
]

# Prefix the mean columns so they do not clash with the columns of DT once
# the two tables are joined.
setnames(centers, old = cols, new = paste0("mean_", cols))
# Cross join (Cartesian product) of two or more tables.
#
# Arguments:
#   ... two or more tables (data.frame / data.table) that support nrow(),
#       row indexing via x[i, ] and cbind(). With a single table, that table
#       is returned unchanged.
# Returns:
#   A table with nrow(a) * nrow(b) * ... rows holding the columns of all
#   inputs side by side. Tables are folded left to right; within each pair
#   the rows of the left table cycle fastest and each right-hand row is
#   repeated once per left-hand row.
CJ.dft <- function(...) {
  cross_two <- function(x, y) {
    # seq_len() instead of 1:nrow(): 1:0 evaluates to c(1, 0), which would
    # index a phantom row and break the join when either table is empty.
    x_idx <- rep(seq_len(nrow(x)), times = nrow(y))
    y_idx <- rep(seq_len(nrow(y)), each = nrow(x))
    cbind(x[x_idx, ], y[y_idx, ])
  }
  # Fold the remaining tables onto the first one.
  Reduce(f = cross_two, x = list(...)[-1], init = ..1)
} #CJ.dft
# Assign every row of DT to its nearest center, using the squared Euclidean
# distance over Sepal.Length and Sepal.Width.
#
# Reads the globals DT and centers. Side effect: adds (or overwrites) the
# row-number column `rn` on DT by reference.
#
# Returns a data.table with one row per row of DT: `rn` (row number) and
# `ClosestCluster` (the `cluster` value of the nearest center).
crossJoinMtd <- function() {
  # Tag each row with its position so the row-center pairs can be grouped
  # back per original row after the cross join.
  DT[, rn := .I]

  # Pair every row with every center.
  paired <- CJ.dft(DT, centers)

  # Within each original row, keep the cluster whose center is nearest.
  paired[, {
    sq_dist <- (Sepal.Length - mean_Sepal.Length)^2 +
      (Sepal.Width - mean_Sepal.Width)^2
    list(ClosestCluster = cluster[which.min(sq_dist)])
  }, by = .(rn)]
}
Sample usage:
# One row per input row (rn) with the cluster label of its nearest center.
# The labels shown below depend on the random split seeded with set.seed(0L).
crossJoinMtd()
# rn ClosestCluster
# 1: 1 1
# 2: 2 0
# 3: 3 0
# 4: 4 0
# 5: 5 1
# ---
#146: 146 1
#147: 147 0
#148: 148 1
#149: 149 1
#150: 150 0
Some timings:
# Time 100 evaluations of the cross-join approach. The figures below are one
# recorded run and will differ between machines.
microbenchmark(crossJoinMtd(),
times=100L)
# Unit: milliseconds
# expr min lq mean median uq max neval
# crossJoinMtd() 2.7325 3.03085 3.558447 3.26885 3.58805 14.6075 100
If the OP can provide more details on the number of clusters or the dimensionality of the data, this can probably be optimized further.