0

I have a Java Spark application, which I run using Spark-submit command. My Specific use case involves fetching offsets data from Kafka using spark consumer in a Dataframe (Dataset<Row>) and performing transformation(spark sql) on each row of this dataframe.

Issue: when I use following command where Spark.master is "yarn", line of code inside foreach does nothing. I cannot see any Log on terminal and if we try to do some transformation on this it gives NullPointerException. But when we use Spark.master as "local" in this command we can see each row in Log. (In case of information on why yarn and why local : https://spark.apache.org/docs/latest/submitting-applications.html#master-urls

This is the command I am using:

spark-submit --conf spark.app.name=SparkHwcIntegration --conf spark.master=yarn --conf spark.submit.deployMode=client --conf spark.driver.memory=1g --conf spark.executor.memory=1g --conf spark.driver.cores=1 --conf spark.executor.cores=3 --conf spark.executor.instances=2 --class "com.saurabh.kafkaExtractorMain" --jars kafka-spark-1.0.jar

Java code snippet for which I am experiencing this issue:

package com.saurabh;

import com.saurabh.SparkSessionSingleton;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.get_json_object;
import static org.apache.spark.sql.types.DataTypes.StringType;

public class SparkDfSam {
    public static Logger LOG = LoggerFactory.getLogger(SparkDfSam.class);

    public static void extractKafkaDataframeFromSparkNew() {
        LOG.info("::Creating Session::");
        SparkSession spark = SparkSessionSingleton.getInstance();

        LOG.info("::Spark Read() start::");

        Dataset<Row> df = spark
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", "<List of kafka broker>")
                .option("subscribe", "<TopicName>")
                .option("startingOffsets", "earliest")
                .load();
        df.printSchema();
        LOG.info("-------------------Spark ReadStream() end -------------------");

        Dataset<Row> processedDF =
                df.withColumn("key2", col("key").cast(StringType))
                        .withColumn("value2", col("value").cast(StringType)).drop("key", "value", "topic", "partition", "offset", "timestamp", "timestampType");
        Dataset<Row> colRep = processedDF.withColumn("new", get_json_object(col("value2"), "$.repeatedMessages"));
        LOG.info("-------------------Spark writeStream() start -------------------");
        StreamingQuery query = colRep.writeStream()
                .foreachBatch((batchDf, batchId) -> {

                    batchDf.foreach(row -> {
                        LOG.info("Row String value is: {}", row.getString(0)); // this logs nothing
                        LOG.info("Row value: {}", row); // this also logs nothing
                    });


                }).start();

        try {
            query.awaitTermination(30 * 1000); // 30s
        } catch (StreamingQueryException e) {
            throw new RuntimeException(e);
        }

        LOG.info("-------------------Spark writeStream() end -------------------");
    }

}

Sample Json data read from Kafka:

{
  "startTransaction": true,
  "transactionId": "REFERENCE_TABLES",
  "endTransaction": true,
  "repeatedMessages": {
    "HortonWorks": [
      {
        "productCatalogParameterRelKey": "testProductKey",
        "queryTime": 1002839,
        "transactionTime": 18929,
        "catalogItemName": "testCatalogItem",
        "endDate": 978,
        "itemRoleInd": "OC"
      }
    ],
    "HiveWHC": [
      {
        "HWC": 1687238823610,
        "Amd": "SAMSON_100009925_MEMO_947067743",
        "Kart": 1687264015000,
        "customerKey": "SAMSON_100009925",
        "systemMessage": "Message"
      },
      {
        "HWC": 78244,
        "Amd": "",
        "Kart": 1687264015000,
        "customerKey": "Apple",
        "systemMessage": "Message-Json"
      }
    ]
  }
}
James Z
  • 12,209
  • 10
  • 24
  • 44

0 Answers0