Alternatively, with the tidyverse, use readr::read_csv_chunked together with readr::write_csv, like so:
library(tidyverse)

inputfile = "./commodity_trade_statistics_data.csv"
chunk.size = 100 * 1000 * 5

proc.chunk = function(df, pos) {
  # pos is the row position in the original file where this chunk starts,
  # so floor(pos / chunk.size) numbers the output files 0, 1, 2, ...
  df %>%
    mutate(row_idx = seq(pos, length.out = nrow(df))) %>%  # optional: preserve the row number within the original csv
    write_csv(
      paste0("chunk_", floor(pos / chunk.size), ".csv")
    )
}

read_csv_chunked(
  inputfile,
  callback = SideEffectChunkCallback$new(proc.chunk),
  chunk_size = chunk.size,
  progress = TRUE  # optional: show a progress bar
)
Within readr::read_csv_chunked, one can also set the parameter locale via readr::locale, which lets you define the datetime format and the time zone.
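For example (a minimal sketch; the date/time format strings and the time zone below are assumptions, adjust them to match the actual file):

read_csv_chunked(
  inputfile,
  callback = SideEffectChunkCallback$new(proc.chunk),
  chunk_size = chunk.size,
  locale = locale(
    date_format = "%d/%m/%Y",   # assumed: day-first dates
    time_format = "%H:%M:%S",   # assumed: 24-hour clock
    tz = "UTC"                  # time zone used when parsing datetimes
  )
)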
To "lazify" the estimation of chunck.size
, I wrote a crude function based on simple statistics:
calc.chunk.size = function(
  file,
  limit.mb.per.file = 100,    # limit in MB per output file
  floor.to.nrow = 10000,      # round the chunk size down to a multiple of this, for easier numbers
  size.estimate.sigma = 5,    # assume row size is roughly normally distributed and the first n rows are representative;
                              # 3 sigma leaves ~0.1% chance of a file exceeding the size limit,
                              # 5 sigma is, well, closer to Six Sigma :)
  sample.first.nrow = 100L
) {
  # requires package "R.utils"
  tot.nrow = R.utils::countLines(file)  # informational; not needed for the estimate below
  # alternatively, on a POSIX machine (Linux or macOS):
  # tot.nrow = system("wc -l ./commodity_trade_statistics_data.csv", intern = TRUE) %>%
  #   str_extract("\\d+") %>%
  #   as.numeric()

  bytes.per.row = read_lines(file,
                             skip = 1,                  # skip the header, or the 1st row anyway ...
                             n_max = sample.first.nrow  # take a sample of at most the first N rows
                            ) %>%
    nchar(type = "bytes") %>%
    { list(mean = mean(.), sd = sd(.)) }

  est.bytes.per.row = bytes.per.row$mean + size.estimate.sigma * bytes.per.row$sd

  est.chunk.size = limit.mb.per.file * 1000 * 1000 / est.bytes.per.row
  est.chunk.size = max(c(1, floor(est.chunk.size / floor.to.nrow))) * floor.to.nrow
  est.chunk.size
}
chunk.size = calc.chunk.size(inputfile)
# chunk.size = 540000 for this particular file
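As a quick sanity check (a sketch only; it assumes the chunk_<n>.csv files were written to the working directory as above), one can count the rows in each generated chunk:

chunk.files = list.files(pattern = "^chunk_\\d+\\.csv$")
sapply(chunk.files, function(f) nrow(read_csv(f, col_types = cols())))
# each count should be at most chunk.size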