
I am new to Spark Streaming and am trying to implement a Kafka/MongoDB integration where my code pulls JSON data from a Kafka topic and inserts it into MongoDB. Below is my code:

package com.streams.sparkmongo

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import com.mongodb.spark._

object MongoStream {

  def main(args: Array[String]) {

    //Initializing Spark
    val conf = SparkSession.builder()
      .master("spark://localhost:7077")
      .appName("MongoStream")
      .config("spark.mongodb.output.uri", "mongodb://localhost/test.myCollection")
      .getOrCreate()

    //Defining Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "mongostream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (true: java.lang.Boolean),
      "zookeeper.connect" -> "localhost:2181",
      "zookeeper.connection.timeout.ms" -> "10000",
      "zookeeper.session.timeout.ms" -> "10000")

    // Create a StreamingContext with a 10 second batch interval from the SparkSession's SparkContext
    val ssc = new StreamingContext(conf.sparkContext, Seconds(10))

    //Defining Kafka topic
    val topics = Array("mongostreamtest")

    val stream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))

    // For each batch, take the message value (the JSON string) and save the RDD to MongoDB
    val elementDstream = stream.map(v => v.value)
    elementDstream.foreachRDD { rdd =>
      import conf.implicits._
      //val payload = conf.read.json(rdd)
      rdd.saveToMongoDB()
    }
    ssc.start()
    ssc.awaitTermination()
  }

}
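
Since the Kafka messages are JSON strings, my understanding is that they may need to be parsed into BSON Documents before the save. Below is a minimal sketch of what I mean; the Document.parse step is my assumption and is not in the code above, and everything else reuses the stream and imports from the program.

import org.bson.Document

// Minimal sketch: convert each JSON string into a BSON Document, then save the batch to MongoDB.
// Assumes each Kafka message value is a single well-formed JSON object.
stream.map(record => record.value())
  .foreachRDD { rdd =>
    val docs = rdd.map(json => Document.parse(json)) // JSON string -> org.bson.Document
    docs.saveToMongoDB()                             // writes to the collection from spark.mongodb.output.uri
  }

This still relies on the spark.mongodb.output.uri set on the SparkSession for the target database and collection.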

I am looping over each RDD and saving it to MongoDB using the helper method saveToMongoDB(). I am getting the following error. Any inputs would be much appreciated.

WARN TaskSetManager: Lost task 0.2 in stage 9.0 (TID 11, <<ip>>, executor 0): com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting for a server that matches WritableServerSelector. Client view of cluster state is {type=REPLICA_SET, servers=[{address=localhost:27017, type=REPLICA_SET_SECONDARY, roundTripTime=0.6 ms, state=CONNECTED}]
        at com.mongodb.connection.BaseCluster.createTimeoutException(BaseCluster.java:377)
        at com.mongodb.connection.BaseCluster.selectServer(BaseCluster.java:104)
        at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:75)
        at com.mongodb.binding.ClusterBinding$ClusterBindingConnectionSource.<init>(ClusterBinding.java:71)
        at com.mongodb.binding.ClusterBinding.getWriteConnectionSource(ClusterBinding.java:68)
        at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:411)
        at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:168)
        at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:74)
        at com.mongodb.Mongo.execute(Mongo.java:845)
        at com.mongodb.Mongo$2.execute(Mongo.java:828)
        at com.mongodb.MongoCollectionImpl.insertMany(MongoCollectionImpl.java:338)
        at com.mongodb.MongoCollectionImpl.insertMany(MongoCollectionImpl.java:322)
        at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1$$anonfun$apply$2.apply(MongoSpark.scala:119)
        at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1$$anonfun$apply$2.apply(MongoSpark.scala:119)
        at scala.collection.Iterator$class.foreach(Iterator.scala:893)
        at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
        at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1.apply(MongoSpark.scala:119)
        at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1.apply(MongoSpark.scala:118)
        at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:186)
        at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:184)
        at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
        at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
        at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:154)
        at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:171)
        at com.mongodb.spark.MongoConnector.withCollectionDo(MongoConnector.scala:184)
        at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:118)
        at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:117)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
        at org.apache.spark.scheduler.Task.run(Task.scala:108)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  • Could you provide more information about your MongoDB deployment? i.e., what command line did you execute to run the MongoDB server? Is it a standalone or a replica set? – Wan B. Mar 28 '18 at 03:58
