I'm facing a performance issue when I insert records in R Script into Teradata database via JDBC. It takes around 100 sec for 100.000 records, which is quite slow considering I have to insert around 10 million records.
Does someone any idea how I can improve the performance of the insert?
I setup the connection on the following way:
options(java.parameters = "-Xmx8048m")
options("encoding")
library(RJDBC)
.jinit()
# path to the JDBC driver:
path.jdbcdriver <- paste0(support_folder, 'jdbc/')
cat("\n JDBC driver Path:", path.jdbcdriver)
class.jdbcdriver <- paste0(path.jdbcdriver, c('terajdbc4.jar', 'tdgssconfig.jar'), collapse=';')
# on a unix-like OS the class path needs to be modified
.jaddClassPath(paste0(path.jdbcdriver, 'terajdbc4.jar'))
.jaddClassPath(paste0(path.jdbcdriver, 'tdgssconfig.jar'))
#.jclassPath() # check class path
# extra parameters for JDBC driver connection
params.jdbc <- 'FINALIZE_AUTO_CLOSE=ON,TMODE=DEFAULT, LOB_SUPPORT=OFF, CHARSET=UTF8, rewriteBatchedStatements=true'
# create driver object
driver <- JDBC(driverClass = 'com.teradata.jdbc.TeraDriver', classPath = class.jdbcdriver)
After that I'm using a prepared statement. As you see below I'm creating batches with 10.000 records and commit them. If I try to increase the batch size to for instance 50.000, it fails and it throws a memory error exception.
The inputs for the insert statement are coming from a data.table and I'm using 3 columns from this data.table: myinsert(df[n,1],df[n,11],as.numeric(df[n,7]))
In case of exceptions I print out only the first exception, because otherwise it would print out the exceptions of the whole batch which is 10.000 records. That's why I commented out the while statement in the exception handling.
target_db <- paste0("database ", database)
myinsert <-
function(arg1,arg11,arg7){
.jcall(ps,"V","setInt",as.integer(1),as.integer(arg1))
.jcall(ps,"V","setInt",as.integer(2),as.integer(arg11))
.jcall(ps,"V","setDouble",as.integer(3),arg7)
.jcall(ps,"V","addBatch")
}
#conn <- dbConnect(driver, paste0('jdbc:teradata://', td.ip, '/', params.jdbc,', ','TYPE=FASTLOAD'), td.uid, td.pwd)
conn <- dbConnect(driver, paste0('jdbc:teradata://', td.ip, '/', params.jdbc), td.uid, td.pwd)
##prepare
insert_stmt <- paste0("insert into ",database, ".table_name_x values(?,","2",",","?",",","2",",",reporting_year,",",knowledge_year,",",bag_view_id,",","-11",",","?",",",delivery_type_id,",","-11",",",version_id,")")
#,?,?,?,?,?,?,?,?
ps = .jcall(conn@jc,"Ljava/sql/PreparedStatement;","prepareStatement",insert_stmt)
#apply & commit
batchsize = 100
cat("\n# of records to be loaded: ", nrow(df))
cat("\n* batch creation is running ...\n")
if (nrow(df) >= batchsize) {
for(n in 1:nrow(df)) {
myinsert(df[n,1],df[n,11],as.numeric(df[n,7]))
if (((n %% 10000) == 0) || (n == nrow(df))) {
if (((n %% 100000) == 0) || (n == nrow(df))) {
#cat("\n")
print(paste0("Execute Batch ", n, "; Time: ", Sys.time()))
}
.jcall(ps,"[I","executeBatch", check=FALSE)
ex = .jgetEx() # save exceptions from PreparedStatement.executeBatch()
.jclear() # clear all pending exceptions
if (!is.jnull(ex)) {
#while (!is.jnull(ex)) { # loop thru chained exceptions
sw = .jnew("java/io/StringWriter")
pw = .jnew("java/io/PrintWriter",.jcast(sw, "java/io/Writer"),TRUE)
.jcall(ex,"V","printStackTrace",pw) # redirect printStackTrace to a Java PrintWriter so it can be printed in Rterm AND Rgui
if (ex %instanceof% "java.sql.BatchUpdateException") {
print(.jcall(ex,"[I","getUpdateCounts")) # print int[] update count showing 3 rows inserted successfully (1) and 2 rows failed to insert (-3)
}
cat(.jcall(sw,"Ljava/lang/String;","toString")) # print the error message and stack trace
if (ex %instanceof% "java.sql.SQLException") {
ex = ex$getNextException()
} else {
ex = ex$getCause()
}
#}
# capture chained JDBC SQLWarning messages and stack trace from Connection.rollback()
.jcall(conn, "V", "rollback")
w = .jcall(conn, "Ljava/sql/SQLWarning;", "getWarnings") # save warnings from Connection.rollback()
while (!is.jnull(w)) { # loop thru chained warnings
sw = .jnew("java/io/StringWriter")
pw = .jnew("java/io/PrintWriter",.jcast(sw, "java/io/Writer"),TRUE)
.jcall(w,"V","printStackTrace",pw) # redirect printStackTrace to a Java PrintWriter so it can be printed in Rterm AND Rgui
cat(.jcall(sw,"Ljava/lang/String;","toString")) # print the warning message and stack trace
w = w$getNextWarning()
}
}
else {
.jcall(conn@jc, "V", "commit")
#cat("\ncommiting...")
}
}
}
#})
}
dbCommit(conn)
.jcall(ps,"V","close")
cat("\nDatabase commit finished"
I used the parameter rewriteBatchedStatements=true, but actually it does not help because there is no such a JDBC parameter in the Teradata JDBC.
I cannot use parameter FASTLOAD, because the table is not empty