
I have a very small illustrative Apache Beam pipeline that I'm trying to run with SparkRunner.

Below is the pipeline code:

import java.nio.charset.StandardCharsets;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.AfterFirst;
import org.apache.beam.sdk.transforms.windowing.AfterPane;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.joda.time.Duration;

public class SparkMain {

    public static void main(String[] args) {
        PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
        Pipeline pipeline = Pipeline.create(options);
        Duration durations = Duration.standardSeconds(10);

        pipeline.apply("Read from PubSub",
                PubsubIO.readMessagesWithAttributes().fromSubscription("my-subscription"))
                // Global window with early firings: fire after 10 elements or 10s of
                // processing time, whichever comes first; fired panes are discarded.
                .apply("Window", Window.<PubsubMessage>into(new GlobalWindows())
                        .triggering(AfterWatermark.pastEndOfWindow()
                                .withEarlyFirings(AfterFirst.of(
                                        AfterPane.elementCountAtLeast(10),
                                        AfterProcessingTime.pastFirstElementInPane().plusDelayOf(durations))))
                        .discardingFiredPanes())
                .apply("Convert to String", ParDo.of(new DoFn<PubsubMessage, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext context) {
                        PubsubMessage msg = context.element();
                        String msgStr = new String(msg.getPayload(), StandardCharsets.UTF_8);
                        context.output(msgStr);
                    }
                }))
                .apply("Write to File", TextIO
                        .write()
                        .withWindowedWrites()
                        .withNumShards(1)
                        .to("/Users/my-user/Documents/spark-beam-local/windowed-output"));

        pipeline.run();
    }
}

I'm using Apache Beam 2.16.0 and Spark 2.4.4 in local mode.

When I run this pipeline with the DirectRunner or DataflowRunner, everything works fine, but as soon as I switch the runner to SparkRunner the tasks start failing with the following exception:

19/12/17 12:15:45 INFO MicrobatchSource: No cached reader found for split: [org.apache.beam.sdk.io.gcp.pubsub.PubsubUnboundedSource$PubsubSource@46d6c879]. Creating new reader at checkpoint mark null
19/12/17 12:15:46 WARN BlockManager: Putting block rdd_7_9 failed due to exception java.lang.NullPointerException.
19/12/17 12:15:46 WARN BlockManager: Block rdd_7_9 could not be removed as it was not found on disk or in memory
19/12/17 12:15:46 ERROR Executor: Exception in task 9.0 in stage 2.0 (TID 9)
java.lang.NullPointerException
    at org.apache.beam.sdk.io.gcp.pubsub.PubsubUnboundedSource$PubsubReader.getWatermark(PubsubUnboundedSource.java:941)
    at org.apache.beam.runners.spark.io.MicrobatchSource$Reader.getWatermark(MicrobatchSource.java:291)
    at org.apache.beam.runners.spark.stateful.StateSpecFunctions$1.apply(StateSpecFunctions.java:181)
    at org.apache.beam.runners.spark.stateful.StateSpecFunctions$1.apply(StateSpecFunctions.java:107)
    at org.apache.spark.streaming.StateSpec$$anonfun$1.apply(StateSpec.scala:181)
    at org.apache.spark.streaming.StateSpec$$anonfun$1.apply(StateSpec.scala:180)
    at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$$anonfun$updateRecordWithData$1.apply(MapWithStateRDD.scala:57)
    at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$$anonfun$updateRecordWithData$1.apply(MapWithStateRDD.scala:55)
    at scala.collection.Iterator$class.foreach(Iterator.scala:891)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
    at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$.updateRecordWithData(MapWithStateRDD.scala:55)
    at org.apache.spark.streaming.rdd.MapWithStateRDD.compute(MapWithStateRDD.scala:159)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:337)
    at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:335)
    at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
    at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
    at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
    at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
    at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
    at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:123)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

The INFO line at the top shows that a new reader is created "at checkpoint mark null", and the NullPointerException is thrown the first time getWatermark() is called on that fresh reader, which makes me suspect the watermark state is never initialized under the Spark runner.

I'm submitting the job with the following spark-submit command:

spark-submit --class SparkMain \
  --master local[*] \
  target/beam-1.0-SNAPSHOT.jar \
  --runner=SparkRunner \
  --project=<my-project> \
  --gcpTempLocation=gs://<my-bucket>/temp \
  --checkpointDir=/Users/my-user/Documents/beam-tmp/
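
To rule out the flags simply not being picked up, the same options can also be set and checked programmatically; a minimal sketch, assuming beam-runners-spark is on the classpath (the paths below are the same placeholders as above):

// Sketch: configure the Spark runner in code and log what was resolved,
// to verify that --runner and --checkpointDir actually take effect.
// Requires: import org.apache.beam.runners.spark.SparkPipelineOptions;
//           import org.apache.beam.runners.spark.SparkRunner;
SparkPipelineOptions sparkOptions = PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(SparkPipelineOptions.class);
sparkOptions.setRunner(SparkRunner.class);
sparkOptions.setSparkMaster("local[*]");
sparkOptions.setCheckpointDir("/Users/my-user/Documents/beam-tmp/");
System.out.println("Runner: " + sparkOptions.getRunner().getSimpleName());
Pipeline pipeline = Pipeline.create(sparkOptions);

Note that in spark-submit, flags placed before the application jar are consumed by spark-submit itself; only the flags after target/beam-1.0-SNAPSHOT.jar reach the Beam pipeline.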

There is a very similar but unanswered question: Apache Beam pipeline with PubSubIO error using Spark Runner PubsubUnboundedSource$PubsubReader.getWatermark(PubsubUnboundedSource.java:1030)
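
One isolation test I plan to try is swapping PubsubIO for GenerateSequence (part of the core Beam SDK) to check whether the rest of the pipeline runs under SparkRunner; a minimal sketch, with a placeholder output path:

// Isolation test (sketch): replace PubsubIO with GenerateSequence, which emits
// an unbounded stream of Longs, to see if the failure is specific to the source.
// Requires: import org.apache.beam.sdk.io.GenerateSequence;
pipeline.apply("Generate",
        GenerateSequence.from(0).withRate(1, Duration.standardSeconds(1)))
        .apply("Window", Window.<Long>into(new GlobalWindows())
                .triggering(AfterWatermark.pastEndOfWindow()
                        .withEarlyFirings(AfterFirst.of(
                                AfterPane.elementCountAtLeast(10),
                                AfterProcessingTime.pastFirstElementInPane()
                                        .plusDelayOf(Duration.standardSeconds(10)))))
                .discardingFiredPanes())
        .apply("Convert to String", ParDo.of(new DoFn<Long, String>() {
            @ProcessElement
            public void processElement(ProcessContext context) {
                context.output(String.valueOf(context.element()));
            }
        }))
        .apply("Write to File", TextIO.write()
                .withWindowedWrites()
                .withNumShards(1)
                .to("/tmp/spark-beam-test/windowed-output")); // placeholder path

If this variant runs cleanly on SparkRunner, the NullPointerException is specific to the PubsubUnboundedSource reader rather than to the windowing or the write.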

Can anybody point me in the right direction on how to start debugging this problem?

  • Don't know why, but removing `--runner=SparkRunner` seemed to solve the issue. – kaysush Dec 17 '19 at 11:04
  • My bad: removing `--runner=SparkRunner` made the pipeline fall back to `DirectRunner`, which is why it started working. – kaysush Dec 30 '19 at 03:21
