
I am trying to save a DataFrame to an HDFS location, but the save is taking a long time. The action before the save joins two tables using Spark SQL. I need to know why the save runs in four stages and how to improve its performance. I have attached the stage list here: Spark UI Image of the job.

I have also attached snippets of my code below.

Spark code:

This function is called from the main class; the models variable carries table metadata parsed from an XML file. The function first loads the data for the source table and then retrieves the data from the other join tables.

    // Imports used across the snippets below.
    import java.time.LocalDateTime

    import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, functions}
    import org.apache.spark.sql.functions.col

    def sourceGen(spark: SparkSession,
                  minBatchLdNbr: Int,
                  maxBatchLdNbr: Int,
                  batchLdNbrList: String,
                  models: (GModel, TModel, NModel)): Unit = {
    val configJson = models._3
    val gblJson = models._1
    println("Source Loading started")
    val sourceColumns = configJson.transformationJob.sourceDetails.sourceSchema
    val query = new StringBuilder("select ")
    // Append a cast expression for every column flagged "nak" in the XML config.
    sourceColumns.foreach { srcColumn =>
      if (srcColumn.isKey == "nak") {
        query.append(
          "cast(" + srcColumn.columnExpression + " as " + srcColumn.columnDataType +
            ") as " + srcColumn.columnName + ",")
      }
    }
    var tableQuery: String =
      if (!configJson.transformationJob.sourceDetails.sourceTableSchemaName.isEmpty) {
        if (!batchLdNbrList.trim.isEmpty)
          query.dropRight(1) + " from " +
            configJson.transformationJob.sourceDetails.sourceTableSchemaName + "." +
            configJson.transformationJob.sourceDetails.sourceTableName +
            " where batch_ld_nbr > " + minBatchLdNbr + " and batch_ld_nbr <= " + maxBatchLdNbr +
            " or batch_ld_nbr in ( " + batchLdNbrList + " )"
        else
          query.dropRight(1) + " from " +
            configJson.transformationJob.sourceDetails.sourceTableSchemaName + "." +
            configJson.transformationJob.sourceDetails.sourceTableName +
            " where batch_ld_nbr > " + minBatchLdNbr + " and batch_ld_nbr <= " + maxBatchLdNbr
      } else {
        if (!batchLdNbrList.trim.isEmpty)
          query.dropRight(1) + " from " +
            gblJson.gParams.sourceTableSchemaName + "." +
            configJson.transformationJob.sourceDetails.sourceTableName +
            " where batch_ld_nbr > " + minBatchLdNbr + " and batch_ld_nbr <= " + maxBatchLdNbr +
            " or batch_ld_nbr in ( " + batchLdNbrList + " )"
        else
          query.dropRight(1) + " from " +
            gblJson.gParams.sourceTableSchemaName + "." +
            configJson.transformationJob.sourceDetails.sourceTableName +
            " where batch_ld_nbr > " + minBatchLdNbr + " and batch_ld_nbr <= " + maxBatchLdNbr
      }
    if (minBatchLdNbr == 0 && maxBatchLdNbr == 0) {
      tableQuery = tableQuery.split("where")(0)
    }
    println("Time"+LocalDateTime.now());
    val tableQueryDf: DataFrame = spark.sql(tableQuery)
    println("tableQueryDf"+tableQueryDf);
    println("Time"+LocalDateTime.now());
    println("Source Loading ended")
    println("Parent Loading Started")
    val parentColumns = configJson.transformationJob.sourceDetails.parentTables
    val parentSourceJoinDF: DataFrame = if (!parentColumns.isEmpty) {
      parentChildJoin(tableQueryDf,
                      parentColumns,
                      spark,
                      gblJson.gParams.pSchemaName)
    } else {
      tableQueryDf
    }
    println("tableQueryDf"+tableQueryDf);
    println("Parent Loading ended")
    println("Key Column Generation Started")
    println("Time"+LocalDateTime.now());
    val arrOfCustomExprs = sourceColumns
      .filter(_.isKey.toString != "nak")
      .map(
        f =>
          functions
            .expr(f.columnExpression)
            .as(f.columnName)
            .cast(f.columnDataType))
    val colWithExpr = parentSourceJoinDF.columns.map(f =>
      parentSourceJoinDF.col(f)) ++ arrOfCustomExprs
    val finalQueryDF = parentSourceJoinDF.select(colWithExpr: _*)
    println("finalQueryDF"+finalQueryDF);
    println("Time"+LocalDateTime.now());
    keyGenUtils.writeParquetTemp(
      finalQueryDF,
      configJson.transformationJob.globalParams.hdfsInterimPath + configJson.transformationJob.sourceDetails.sourceTableName + "/temp_" + configJson.transformationJob.sourceDetails.sourceTableName
    )
    println("PrintedTime"+LocalDateTime.now());
    println("Key Column Generation Ended")
    }
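
For reference, with hypothetical values in place of the XML config (schema sales, source table order_line_t, and two made-up non-key columns), the string built above would look roughly like this before it is passed to spark.sql:

    // Illustration only: schema, table, column names and batch numbers are invented;
    // the real values come from the XML config.
    val exampleTableQuery =
      "select cast(ord_id as bigint) as ord_id, cast(ord_cal_dt as string) as ord_cal_dt" +
        " from sales.order_line_t" +
        " where batch_ld_nbr > 100 and batch_ld_nbr <= 200 or batch_ld_nbr in ( 150 )"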

The code below retrieves the data from the join tables and joins it to the source DataFrame.

    private def parentChildJoin(tableQueryDf: DataFrame,
                                ptJoin: Array[ParentTables],
                                sparkSession: SparkSession,
                                gParentSchema: String): DataFrame = {
    if (ptJoin.isEmpty) {
      tableQueryDf
    } else {
      val parentJoin = ptJoin.head
      val columns = new StringBuilder("select ")
      for (ptCols <- parentJoin.columns) {
        columns.append(
          ptCols.columnExpression + " as " + ptCols.columnName + ",")
      }
      val statement = columns.dropRight(1)

      if (!parentJoin.pSchemaName.isEmpty) {
        statement.append(
          " from " + parentJoin.pSchemaName + "." + parentJoin.pTableName)
      } else {
        statement.append(" from " + gParentSchema + "." + parentJoin.pTableName)
      }
      println("Time"+LocalDateTime.now());
      println("parentJoin.pTableName"+parentJoin.pTableName);
      val pQueryDF =
        if (parentJoin.pTableName.equalsIgnoreCase("order_summary_si_fact_t")) {
          val ordCalDt = "ord_cal_dt"
          val distinctDates = tableQueryDf
            .selectExpr(ordCalDt)
            .distinct
            .collect
            .map(_.getAs[String](0))
          sparkSession.sql(statement.toString).where(col(ordCalDt).isin(distinctDates: _*)).distinct
        } else {
          sparkSession.sql(statement.toString).distinct
        }
      println("Time"+LocalDateTime.now());
      //val pQueryDF = sparkSession.sql(statement.toString).distinct
      println("statement-"+parentJoin.pTableName+"-"+statement);
      parentChildJoin(
        tableQueryDf.join(pQueryDF,
                          parentJoin.pJoinCondition.map(_.sourceKey).toSeq,
                          parentJoin.joinType),
        ptJoin.tail,
        sparkSession,
        gParentSchema)
    }
    }
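
To make the recursion above easier to follow: for two hypothetical parent tables (the table names, columns, and join keys below are invented for illustration; they are not the actual XML config), the call chain is roughly equivalent to reading each parent table with its own SQL query, de-duplicating it with distinct, and joining it onto the source DataFrame one after the other:

    // Sketch only: hypothetical parent tables and join keys.
    val parentADf = sparkSession.sql("select cust_id as cust_id, cust_nm as cust_nm from dim.parent_a_t").distinct
    val parentBDf = sparkSession.sql("select ord_cal_dt as ord_cal_dt, fisc_wk_nbr as fisc_wk_nbr from dim.parent_b_t").distinct
    val joinedDf = tableQueryDf
      .join(parentADf, Seq("cust_id"), "left_outer")
      .join(parentBDf, Seq("ord_cal_dt"), "left_outer")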

This is the function that writes the result to HDFS.

    def writeParquetTemp(df: DataFrame, hdfsPath: String): Unit = {
    df.write.format("parquet").option("compression", "none").mode(SaveMode.Overwrite).save(hdfsPath)
    }
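
Since the question is why this write runs as four stages, a variant of this helper that also logs the DataFrame's partition count just before the save could help relate the number of write tasks to the stages shown in the Spark UI. This is only a sketch, and the helper name is illustrative:

    // Sketch: same write as above, plus a partition-count log for diagnosing the write stages.
    def writeParquetTempWithLog(df: DataFrame, hdfsPath: String): Unit = {
      println("partitions before write: " + df.rdd.getNumPartitions)
      df.write.format("parquet").option("compression", "none").mode(SaveMode.Overwrite).save(hdfsPath)
    }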

Spark submit configuration:

    /usr/hdp/2.6.3.0-235/spark2/bin//spark-submit --master yarn --deploy-mode client --driver-memory 30G --executor-memory 25G --executor-cores 6 --conf spark.dynamicAllocation.enabled=true --conf spark.shuffle.service.enabled=true --conf spark.sql.autoBroadcastJoinThreshold=774857600 --conf spark.kryoserializer.buffer.max.mb=512 --conf spark.dynamicAllocation.maxExecutors=40 --conf spark.eventLog.enabled=true --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.parquet.binaryAsString=true  --conf spark.sql.broadcastTimeout=36000 --conf spark.sql.shuffle.partitions=500
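
For completeness, the spark.sql settings from the submit command can also be set programmatically. The snippet below is only a sketch of the equivalent SparkSession configuration, with the values copied from the command line and a placeholder application name:

    // Sketch: SQL-related settings from the spark-submit call, applied via the session builder.
    val spark = SparkSession.builder()
      .appName("sourceGenJob") // placeholder, not the real application name
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.sql.autoBroadcastJoinThreshold", "774857600")
      .config("spark.sql.broadcastTimeout", "36000")
      .config("spark.sql.shuffle.partitions", "500")
      .config("spark.sql.parquet.binaryAsString", "true")
      .getOrCreate()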