0

I'm facing a performance issue when I insert records in R Script into Teradata database via JDBC. It takes around 100 sec for 100.000 records, which is quite slow considering I have to insert around 10 million records.

Does anyone have an idea how I can improve the performance of the insert?

I setup the connection on the following way:

    # JVM heap for rJava; must be set BEFORE the JVM is initialised by .jinit().
    # NOTE(review): 8048m is an odd size — 8192m (8 GB) was probably intended; confirm.
    options(java.parameters = "-Xmx8048m")
    # The original line `options("encoding")` only *queried* the current option
    # and discarded the (invisible) result — a no-op. If setting the encoding
    # was intended, the call below is the correct form; left disabled pending
    # confirmation of the original intent.
    # options(encoding = "UTF-8")
    library(RJDBC)
    .jinit()

    # Path to the Teradata JDBC driver jars:
    path.jdbcdriver <- paste0(support_folder, "jdbc/")
    cat("\n JDBC driver Path:", path.jdbcdriver)
    # Join the jars with the platform's path separator (";" on Windows, ":" on
    # unix-alikes) instead of hard-coding ";", so the classPath works on both.
    class.jdbcdriver <- paste0(path.jdbcdriver,
                               c("terajdbc4.jar", "tdgssconfig.jar"),
                               collapse = .Platform$path.sep)
    # Also add the jars to the JVM class path explicitly (needed on unix-like OSes).
    .jaddClassPath(paste0(path.jdbcdriver, "terajdbc4.jar"))
    .jaddClassPath(paste0(path.jdbcdriver, "tdgssconfig.jar"))
    # .jclassPath() # check class path

    # Extra parameters for the JDBC driver connection.
    # NOTE(review): rewriteBatchedStatements is a MySQL Connector/J option and is
    # ignored by the Teradata driver (as observed further down in this post).
    params.jdbc <- 'FINALIZE_AUTO_CLOSE=ON,TMODE=DEFAULT, LOB_SUPPORT=OFF, CHARSET=UTF8, rewriteBatchedStatements=true'

    # Create the driver object.
    driver <- JDBC(driverClass = "com.teradata.jdbc.TeraDriver", classPath = class.jdbcdriver)

After that I'm using a prepared statement. As you can see below, I'm creating batches of 10.000 records and committing them. If I try to increase the batch size to, for instance, 50.000, it fails with an out-of-memory exception.

The inputs for the insert statement are coming from a data.table and I'm using 3 columns from this data.table: myinsert(df[n,1],df[n,11],as.numeric(df[n,7]))

In case of exceptions I print out only the first exception, because otherwise it would print out the exceptions of the whole batch which is 10.000 records. That's why I commented out the while statement in the exception handling.

  target_db <- paste0("database ", database)  # NOTE(review): appears unused in this excerpt — verify later use

  # Bind one row's values to the prepared statement and queue it in the current
  # batch (no execution happens here; executeBatch() runs later).
  # The statement defaults to the global `ps` prepared further below, so the
  # previously hidden global dependency is now explicit and injectable while
  # remaining backward-compatible for existing positional callers.
  #   arg1  - value for SQL parameter 1 (bound via setInt)
  #   arg11 - value for SQL parameter 2 (bound via setInt)
  #   arg7  - value for SQL parameter 3 (bound via setDouble; caller passes as.numeric)
  myinsert <-
    function(arg1, arg11, arg7, stmt = ps) {
      .jcall(stmt, "V", "setInt", 1L, as.integer(arg1))
      .jcall(stmt, "V", "setInt", 2L, as.integer(arg11))
      .jcall(stmt, "V", "setDouble", 3L, arg7)
      .jcall(stmt, "V", "addBatch")
    }

  # Open the connection. The FASTLOAD variant is kept for reference — it
  # requires an empty target table, which does not hold here:
  # conn <- dbConnect(driver, paste0('jdbc:teradata://', td.ip, '/', params.jdbc,', ','TYPE=FASTLOAD'), td.uid, td.pwd)
  conn <- dbConnect(driver, paste0("jdbc:teradata://", td.ip, "/", params.jdbc), td.uid, td.pwd)

  ## Prepare the insert: 3 bind parameters (?); the remaining columns are
  ## literals / script-level variables baked into the SQL text. The collapsed
  ## fragments below produce a string byte-identical to the original
  ## piece-by-piece paste0().
  insert_stmt <- paste0(
    "insert into ", database, ".table_name_x values(?,2,?,2,",
    reporting_year, ",", knowledge_year, ",", bag_view_id,
    ",-11,?,", delivery_type_id, ",-11,", version_id, ")"
  )
  ps <- .jcall(conn@jc, "Ljava/sql/PreparedStatement;", "prepareStatement", insert_stmt)

  # Apply & commit: stream rows into the prepared statement in batches.
  # FIXES vs. original:
  #  - `batchsize` was declared as 100 but the loop hard-coded 10000; the
  #    variable now holds the real batch size and is used in the flush test.
  #  - the guard `if (nrow(df) >= batchsize)` around the whole loop silently
  #    inserted NOTHING when df had fewer rows than one batch; removed.
  #  - rollback/getWarnings were called on the S4 RJDBC wrapper `conn` instead
  #    of the underlying java.sql.Connection `conn@jc` (the commit branch
  #    already used the correct form).
  #  - the final cat() was missing its closing parenthesis (syntax error).
  batchsize <- 10000  # rows per executeBatch(); larger values risk JVM OOM (see note above)
  n_rows <- nrow(df)  # hoisted loop invariant
  cat("\n# of records to be loaded: ", n_rows)
  cat("\n* batch creation is running ...\n")

  for (n in seq_len(n_rows)) {  # seq_len() is safe when n_rows == 0

    myinsert(df[n, 1], df[n, 11], as.numeric(df[n, 7]))

    # Flush a full batch, or the final (possibly partial) batch on the last row.
    if (((n %% batchsize) == 0) || (n == n_rows)) {

      # Progress output every 100.000 rows and at the end.
      if (((n %% 100000) == 0) || (n == n_rows)) {
        print(paste0("Execute Batch ", n, "; Time: ", Sys.time()))
      }
      .jcall(ps, "[I", "executeBatch", check = FALSE)

      ex <- .jgetEx()  # save exceptions from PreparedStatement.executeBatch()
      .jclear()        # clear all pending exceptions
      if (!is.jnull(ex)) {
        # Deliberately NOT looping through chained exceptions: a failed batch
        # chains one exception per failed row (up to `batchsize` of them), so
        # only the first is printed (see explanation above the code).
        sw <- .jnew("java/io/StringWriter")
        pw <- .jnew("java/io/PrintWriter", .jcast(sw, "java/io/Writer"), TRUE)
        # Redirect printStackTrace to a Java PrintWriter so it can be printed
        # in Rterm AND Rgui.
        .jcall(ex, "V", "printStackTrace", pw)
        if (ex %instanceof% "java.sql.BatchUpdateException") {
          # int[] update counts: >= 0 means the row was inserted, -3 means it failed.
          print(.jcall(ex, "[I", "getUpdateCounts"))
        }
        cat(.jcall(sw, "Ljava/lang/String;", "toString"))  # error message + stack trace

        # Roll back the failed batch on the underlying java.sql.Connection.
        .jcall(conn@jc, "V", "rollback")
        # Capture chained JDBC SQLWarning messages from Connection.rollback().
        w <- .jcall(conn@jc, "Ljava/sql/SQLWarning;", "getWarnings")
        while (!is.jnull(w)) {  # loop through chained warnings
          sw <- .jnew("java/io/StringWriter")
          pw <- .jnew("java/io/PrintWriter", .jcast(sw, "java/io/Writer"), TRUE)
          .jcall(w, "V", "printStackTrace", pw)
          cat(.jcall(sw, "Ljava/lang/String;", "toString"))
          w <- w$getNextWarning()
        }
      } else {
        .jcall(conn@jc, "V", "commit")
      }
    }
  }

  dbCommit(conn)            # final commit (harmless if the last batch already committed)
  .jcall(ps, "V", "close")  # release the PreparedStatement
  cat("\nDatabase commit finished")

I used the parameter rewriteBatchedStatements=true, but it does not actually help, because there is no such JDBC parameter in the Teradata JDBC driver.

I cannot use parameter FASTLOAD, because the table is not empty

szend
  • 83
  • 8
  • 1 row/sec is veeeerrrry slow, batched inserts should be thousands/sec. Did you compare performance when loading outside of R, e.g. Studio? You might fastload into an empty staging table and then Insert/Select. – dnoeth Feb 14 '18 at 09:40
  • sorry, I was wrong. It takes around 100 sec for 100.000 records – szend Feb 14 '18 at 09:47
  • use bteq? https://www.tutorialspoint.com/teradata/teradata_bteq.htm. apparently thats the bulk insert equivalent in teradata. relevant qn on SO: https://stackoverflow.com/questions/20021900/loading-files-into-teradata – chinsoon12 Feb 14 '18 at 09:48
  • I would like to avoid storing files and trigger bteq. It would require another jobs which take care of the triggering of the bteq. If it is possible at first I just try to improve the JDBC performance if it is possible. – szend Feb 14 '18 at 09:55

0 Answers0