1

I have been following a tutorial to produce and consume messages from kafka using spark streaming. The idea is to produce simple message which are serialized in avro format. Deserialize the messages from avro format and consume using spark streaming. I am unable to consume the message as the bijection api throws Failed to Invert exception.

Producer:

public static final String schema = "{"
+"\"fields\": ["
+   " { \"name\": \"str1\", \"type\": \"string\" },"
+   " { \"name\": \"str2\", \"type\": \"string\" },"
+   " { \"name\": \"int1\", \"type\": \"int\" }"
+"],"
+"\"name\": \"myrecord\","
+"\"type\": \"record\""
+"}"; 

public static void startAvroProducer() throws InterruptedException{
    Properties props = new Properties();
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,"localhost:9092");
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
    props.put(ProducerConfig.CLIENT_ID_CONFIG, "Kafka Avro Producer");

    Schema.Parser parser = new Schema.Parser();
    Schema schema = parser.parse(AvroProducer.schema);

    Injection<GenericRecord, byte[]> inject = GenericAvroCodecs.toBinary(schema);

    KafkaProducer<String,byte[]> producer = new KafkaProducer<String,byte[]>(props);
    for(int i=0;i<1000;i++){
        GenericData.Record record = new GenericData.Record(schema);
        record.put("str1", "str1-"+i);
        record.put("str2", "str2-"+i);
        record.put("int1", i);

        byte[] bytes = inject.apply(record);

        ProducerRecord<String,byte[]> producerRec = new ProducerRecord<String, byte[]>("jason", bytes);
        producer.send(producerRec);
        Thread.sleep(250);

    }

    producer.close();
}

Consumer:

 private static SparkConf sc = null;
        private static JavaSparkContext jsc = null;
        private static JavaStreamingContext jssc = null;
        private static Injection<GenericRecord,byte[]> inject = null;

        static {
            Schema.Parser parser = new Schema.Parser();
            Schema schema = parser.parse(AvroProducer.schema);
            inject = GenericAvroCodecs.apply(schema);
        }

        public static void startAvroConsumer() throws InterruptedException {
            sc = new SparkConf().setAppName("Spark Avro Streaming Consumer")
                    .setMaster("local[*]");
            jsc = new JavaSparkContext(sc);
            jssc = new JavaStreamingContext(jsc, new Duration(200));

            Set<String> topics = Collections.singleton("jason");
            Map<String, String> kafkaParams = new HashMap<String, String>();
            kafkaParams.put("metadata.broker.list", "localhost:9092");
            JavaPairInputDStream<String, byte[]> inputDstream = KafkaUtils
                    .createDirectStream(jssc, String.class, byte[].class,
                            StringDecoder.class, DefaultDecoder.class, kafkaParams,
                            topics);

            inputDstream.map(message -> inject.invert(message._2).get()).foreachRDD(rdd -> {
                    rdd.foreach(record -> {
                        System.out.println(record.get("str1"));
                        System.out.println(record.get("str2"));
                        System.out.println(record.get("int1"));
                    });
                });

            jssc.start();
            jssc.awaitTermination();
        }

Exception:

com.twitter.bijection.InversionFailure: Failed to invert: [B@3679b3f6
    at com.twitter.bijection.InversionFailure$$anonfun$partialFailure$1.applyOrElse(InversionFailure.scala:43)
    at com.twitter.bijection.InversionFailure$$anonfun$partialFailure$1.applyOrElse(InversionFailure.scala:42)
    at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36)
    at scala.util.Failure.recoverWith(Try.scala:203)
    at com.twitter.bijection.Inversion$.attempt(Inversion.scala:32)
    at com.twitter.bijection.avro.GenericAvroCodec.invert(AvroCodecs.scala:293)
    at com.twitter.bijection.avro.GenericAvroCodec.invert(AvroCodecs.scala:276)
    at com.applications.streaming.consumers.AvroConsumer.lambda$0(AvroConsumer.java:54)
    at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1040)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
    at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Not a data file.
    at org.apache.avro.file.DataFileStream.initialize(DataFileStream.java:105)
    at org.apache.avro.file.DataFileStream.<init>(DataFileStream.java:84)
    at com.twitter.bijection.avro.GenericAvroCodec$$anonfun$invert$2.apply(AvroCodecs.scala:295)
    at com.twitter.bijection.avro.GenericAvroCodec$$anonfun$invert$2.apply(AvroCodecs.scala:293)
    at com.twitter.bijection.Inversion$$anonfun$attempt$1.apply(Inversion.scala:32)
    at scala.util.Try$.apply(Try.scala:192)
    ... 18 more
wandermonk
  • 6,856
  • 6
  • 43
  • 93
  • Code looks fine. Have you tried running console consumer to see the messages? It seems like something is not right with the message. Or look at the size of the message, see if it's all zero bytes. – moon Oct 22 '17 at 05:31
  • @Falan console consumer is able to consume the messages produced. – wandermonk Oct 23 '17 at 04:32
  • BTW, for your consumer `kafkaParams`, did you set the key and value `Deserializer` somewhere? – moon Oct 24 '17 at 01:42
  • @Falan i tried setting the key value deserializers using the kafka parameters. It did not work. – wandermonk Oct 24 '17 at 07:28

0 Answers0