
I have two Spark DataFrames. I want to iterate over one of them with foreach and, for each row, fetch the records from the second DataFrame that match a particular someId.

Every time I do this I get a java.lang.NullPointerException.

I have posted my code below, with comments inside the foreach loop. I have tried three ways to do this, but each one fails with the same error.

Please help me to fix this issue.

val schListDf = spark.read.format("csv")
  .option("header", "true")
  .load("/home/user/projects/scheduled.csv")

schListDf.createOrReplaceTempView("scheduled")

val trsListDf = spark.read.format("csv")
  .option("header", "true")
  .load("/home/user/projects/transaction.csv")

trsListDf.createOrReplaceTempView("transaction")

// THIS WORKS FINE
val df3 = spark.sql("select * from transaction limit 5").show()

schListDf.foreach(row => {
  if (row(2) != null) {

    // I TRIED THIS WAY FIRST, BUT GOT THE SAME ERROR
    val df = spark.sql("select * from transaction where someid = '" + row(2) + "'")

    // I THEN TRIED THIS (WITHOUT THE someId FILTER), BUT GOT THE SAME ERROR
    val df2 = spark.sql("select * from transaction limit 5")

    // I ALSO TRIED FILTERING THE DF DIRECTLY, BUT GOT THE SAME ERROR
    val filteredDataListDf = trsListDf.filter($"someid" === row(2))
  }
})

18/12/02 10:36:34 ERROR Executor: Exception in task 0.0 in stage 4.0 (TID 4)
java.lang.NullPointerException
    at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:142)
    at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:140)
    at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:641)
    at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:52)
    at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:48)
    at scala.collection.Iterator$class.foreach(Iterator.scala:891)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
    at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
    at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

The identical NullPointerException and stack trace are logged for tasks 1.0, 2.0 and 3.0 in stage 4.0, followed by:

18/12/02 10:36:34 WARN TaskSetManager: Lost task 2.0 in stage 4.0 (TID 6, localhost, executor driver): java.lang.NullPointerException (same stack trace as above)

1 Answer


Some parts of Spark are Driver-only.

A DataFrame cannot be accessed from inside a foreach over another DataFrame, because that foreach function runs on the Executor side.

That is the paradigm, and the same applies to RDDs and to the SparkSession itself; that is why calling spark.sql inside the closure fails with a NullPointerException from SparkSession.sessionState on the Executors.

That is to say, foreach itself is fine, but you cannot use a DataFrame val or spark.sql inside it. You would need to pull the rows back to the driver and loop over them there, with a while loop for example.
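
As a rough sketch of that idea (using the column positions and names from your snippet, and assuming the scheduled data is small enough to collect to the driver):

import spark.implicits._

// Bring the scheduled rows back to the driver; only then is it safe to
// use the SparkSession or other DataFrames inside the loop.
val scheduledRows = schListDf.collect()

scheduledRows.foreach { row =>
  if (row(2) != null) {
    // This runs on the driver, so trsListDf (and spark.sql) can be used here.
    val filteredDataListDf = trsListDf.filter($"someid" === row(2))
    filteredDataListDf.show()
  }
}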

This appears to be a common misconception when one is starting out with Spark.
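
A different option, if the scheduled list is too big to collect, is to express the lookup as a join instead of a loop; this stays fully distributed and avoids one query per row. A minimal sketch, assuming the someId column in scheduled.csv is also named someid (adjust to the real header):

// Distinct, non-null someIds from the scheduled data.
val scheduledIds = schListDf.select("someid").na.drop().distinct()

// One distributed join instead of a query per scheduled row.
val matchedTransactions = trsListDf.join(scheduledIds, Seq("someid"))
matchedTransactions.show()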

thebluephantom