First a quick example of what I'm seeing, then some context as to why I'm doing what I'm doing.
# Example data: three groups in i, each with t = 1..4 and a random x.
dt = data.table(i=rep(1:3, each=4), t=rep(1:4, times=3), x=runif(12))
# Baseline: j and keyby written literally inside [].
dt[, .(sum=sum(x), cnt=.N), keyby=.(i)] # works as expected
#    i      sum cnt
# 1: 1 2.932400   4
# 2: 2 1.483940   4
# 3: 3 2.113194   4
dt[, .(sum=sum(x), cnt=.N), keyby=list(i)] # same as above
# let j and keyby be specified by user, optionally NULL
# Both are parsed from strings and passed through eval() inside [].
j_str = parse(text=".(sum=sum(x), cnt=.N)")
by_str = parse(text="keyby=.(i)")
dt[, eval(j_str), eval(by_str)] # problem 1: could not find function .
# Error in .(i) : could not find function "."
# Same grouping spelled with list() instead of .():
by_str = parse(text="keyby=list(i)")
dt[, eval(j_str), eval(by_str)] # problem 2: correct results, but not correct column names
# The grouping column comes back named "keyby" instead of "i":
#    keyby      sum cnt
# 1:     1 2.932400   4
# 2:     2 1.483940   4
# 3:     3 2.113194   4
Notice two problems; I'm mostly concerned about the second one (the wrong column names).
What I would really prefer is to pass in a single string that gets evaluated inside the data.table `[]`, but I couldn't get that to work; I could only get it to work when i, j, etc. are passed separately.
Why am I doing this? The simplified version is that I'm writing a function that does this evaluation:
# Minimal wrapper: subset/aggregate dt using caller-supplied expressions.
# j_str and by_str are already-parsed expressions; each is passed through
# eval() in the second and third positions of dt[ ].
stupidfnc = function(dt, j_str, by_str) {
  dt[, eval(j_str), eval(by_str)]
}
The longer answer is that I want to loop over files, aggregate, rbind, and then aggregate again. However, the full list of aggregated data is too large to fit in memory. Thus, I'm doing a little bit of looping, rbinding, and aggregating, then a little more looping, rbinding, and aggregating, then aggregating the aggregates, then looping again, and so on. I have a function that lets me do this in a flexible manner without having to rewrite the loop every time. I've been doing this quite a bit, and working with the various loop levels is a higher cognitive burden than it really should be, so I was hoping a function like this would be useful.
That function is below.
#' Largest whole divisor of x that does not exceed sqrt(x)
#'
#' Finds the maximum integer `z` such that `x %% z == 0` and `z <= sqrt(x)`.
#' The matching cofactor is simply `y = x / z`. Useful for rbind'ing in
#' chunks with a merge or collapse after each chunk.
#'
#' @param x A single number >= 1 (meant to be a whole number such as a
#'   length).
#' @return The largest value in `1:floor(sqrt(x))` that divides `x` exactly,
#'   as an integer; 1 when nothing larger divides `x` (e.g. `x` is prime).
integer_approx_sqrt = function(x) {
  # For x < 1 the candidate range would start at 0 (and x %% 0 is NaN) or at
  # NaN for negative x, so fail early with a clear message instead.
  if (length(x) != 1 || is.na(x) || x < 1) {
    stop("x must be a single number >= 1", call. = FALSE)
  }
  candidates = seq_len(floor(sqrt(x)))
  divisors = candidates[x %% candidates == 0]
  # A non-whole x has no whole divisor at all; 1 is returned in that case.
  if (length(divisors) == 0) {
    return(1L)
  }
  max(divisors)
}
#' Apply FUN over l in parallel, rbind the results and aggregate them
#'
#' Every element of `l` is passed to `FUN` through `mclapply()`. The results
#' are combined with `rbindlist()` and aggregated with `j_aggr`, grouped by
#' `by_aggr`. When `l` is longer than `chunksize`, the elements are processed
#' chunk by chunk: each chunk is row-bound and aggregated on its own, and the
#' per-chunk aggregates are then row-bound and aggregated once more. In that
#' case `j_aggr` is applied to its own output, so it must refer to columns
#' that exist both in the results of `FUN` and in the aggregated table.
#'
#' @param l List or vector of inputs; each element is handed to `FUN`.
#' @param FUN Function applied to each element of `l`. Its results are passed
#'   to `rbindlist()`.
#' @param j_aggr String holding the aggregation expression used as `j`.
#'   NOTE: it is parsed and evaluated as R code, so do not pass untrusted
#'   input.
#' @param by_aggr String holding the grouping expression used as `by`.
#'   Parsed and evaluated the same way as `j_aggr`.
#' @param mc.cores,mc.preschedule Passed on to `mclapply()`.
#' @param chunksize Maximum number of elements of `l` per chunk. `0` (the
#'   default) uses `integer_approx_sqrt(length(l))`. A value of 1, or one not
#'   smaller than `length(l)`, disables chunking.
#' @param ... Further arguments passed on to `mclapply()`.
#' @return The result of the final aggregation.
mclapply_rbind_aggr = function(l, FUN, j_aggr, by_aggr, mc.cores=1,
                               mc.preschedule=FALSE, chunksize=0, ...) {
  # Parse the aggregation strings once, up front, so a syntax error surfaces
  # before any work is done.
  j_expr = parse(text=j_aggr)
  by_expr = parse(text=by_aggr)
  # Row-bind a list of tables and aggregate the result.
  rbind_aggr = function(tables) {
    rbindlist(tables)[, eval(j_expr), eval(by_expr)]
  }
  if (chunksize == 0) {
    chunksize = integer_approx_sqrt(length(l))
  }
  if (length(l) <= chunksize || chunksize == 1) {
    dtl = mclapply(l, FUN=FUN, mc.cores=mc.cores,
                   mc.preschedule=mc.preschedule, ...)
  } else {
    # splitIndices() takes the NUMBER of chunks, so derive it from the
    # requested chunk size.
    n_chunks = ceiling(length(l) / chunksize)
    dtl = lapply(splitIndices(length(l), n_chunks), function(indcs) {
      # Index into l so FUN sees the elements themselves, not their positions.
      rbind_aggr(mclapply(l[indcs], FUN=FUN, mc.cores=mc.cores,
                          mc.preschedule=mc.preschedule, ...))
    })
  }
  rbind_aggr(dtl)
}