
I referred to the following link to understand the HDFS connector for Kafka: https://docs.confluent.io/2.0.0/connect/connect-hdfs/docs/index.html

I was able to export data from Kafka to HDFS with Hive integration. Now I am trying to write Avro records to Kafka with the help of a Java program:

import java.io.IOException;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient;
import io.confluent.kafka.schemaregistry.client.SchemaMetadata;
import io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException;

public static void main(String[] args) throws InterruptedException, IOException, RestClientException {

    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:9094");
    props.put("acks", "all");
    props.put("retries", 0);
    props.put("key.serializer", "io.confluent.kafka.serializers.KafkaAvroSerializer");
    props.put("value.serializer", "io.confluent.kafka.serializers.KafkaAvroSerializer");
    props.put("schema.registry.url", "http://10.15.167.109:8084");

    Producer<String, GenericRecord> producer = new KafkaProducer<>(props);

    // Fetch the latest schema registered under the subject StreamExample_1
    CachedSchemaRegistryClient registryClient =
            new CachedSchemaRegistryClient("http://10.15.167.109:8084", 100);
    SchemaMetadata metadata = registryClient.getLatestSchemaMetadata("StreamExample_1");
    Schema schema = new Schema.Parser().parse(metadata.getSchema());

    for (int i = 0; i < 1000; i++) {
        GenericRecord avroRecord = new GenericData.Record(schema);
        avroRecord.put("str1", i);
        avroRecord.put("str2", i + 1);

        ProducerRecord<String, GenericRecord> data =
                new ProducerRecord<>("StreamExample_1", String.valueOf(i), avroRecord);
        producer.send(data);
        Thread.sleep(250);
    }

    producer.close();
}

The following schema is registered in the Schema Registry under the name StreamExample_1 (a sketch of registering it programmatically follows the schema):

{
    "type": "record",
    "name": "StreamExample_1",
    "fields": [
        {
            "name": "str1",
            "type": "int"
        },
        {
            "name": "str2",
            "type": "int"
        }
    ]
}
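
For reference, a minimal sketch of registering this schema programmatically with the Confluent client (the subject name and registry URL match the producer code above; note that the KafkaAvroSerializer also auto-registers schemas under the <topic>-key and <topic>-value subjects):

import org.apache.avro.Schema;

import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient;

// One-off registration sketch; assumes the registry at 10.15.167.109:8084 is reachable.
String schemaString =
    "{\"type\": \"record\", \"name\": \"StreamExample_1\", \"fields\": ["
        + "{\"name\": \"str1\", \"type\": \"int\"},"
        + " {\"name\": \"str2\", \"type\": \"int\"}]}";
Schema schema = new Schema.Parser().parse(schemaString);

CachedSchemaRegistryClient client =
    new CachedSchemaRegistryClient("http://10.15.167.109:8084", 100);
int schemaId = client.register("StreamExample_1", schema); // returns the global schema id
System.out.println("Registered schema with id " + schemaId);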

Following is my HDFS connector properties file (see the note on the expected HDFS layout after the config):

name=hdfs-sink
connector.class=io.confluent.connect.hdfs.HdfsSinkConnector
tasks.max=1
topics=StreamExample_1
hdfs.url=hdfs://localhost:9000
flush.size=3
hive.metastore.uris=thrift://10.15.167.109:9083
hive.integration=true
schema.compatibility=BACKWARD
format.class=io.confluent.connect.hdfs.parquet.ParquetFormat
partitioner.class=io.confluent.connect.hdfs.partitioner.HourlyPartitioner
locale=en-us
timezone=UTC
key.converter=org.apache.kafka.connect.storage.StringConverter
key.converter.schema.registry.url=http://localhost:8084
value.converter=io.confluent.connect.avro.AvroConverter
value.converter.schema.registry.url=http://localhost:8084
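
For context: with flush.size=3 the connector commits a file to HDFS after every three records, and the HourlyPartitioner buckets those files into hourly directories. Assuming the connector's default topics.dir of /topics, the resulting layout should look roughly like this (illustrative date and offsets):

/topics/StreamExample_1/year=2018/month=03/day=12/hour=08/StreamExample_1+0+0000000000+0000000002.parquet

With hive.integration=true, the connector also creates an external Hive table over this path, partitioned the same way.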

When I write an Avro record to the Kafka topic, I get the following error in Connect:

  org.apache.kafka.connect.errors.DataException: StreamExample_1
        at io.confluent.connect.avro.AvroConverter.toConnectData(AvroConverter.java:96)
        at org.apache.kafka.connect.runtime.WorkerSinkTask.convertMessages(WorkerSinkTask.java:454)
        at org.apache.kafka.connect.runtime.WorkerSinkTask.poll(WorkerSinkTask.java:287)
        at org.apache.kafka.connect.runtime.WorkerSinkTask.iteration(WorkerSinkTask.java:198)
        at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:166)
        at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:170)
        at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:214)
        at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.kafka.common.errors.SerializationException: Error deserializing Avro message for id 101
Caused by: java.net.ConnectException: Connection refused (Connection refused)
        at java.net.PlainSocketImpl.socketConnect(Native Method)
        at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
        at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
        at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
        at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
        at java.net.Socket.connect(Socket.java:589)
        at java.net.Socket.connect(Socket.java:538)
        at sun.net.NetworkClient.doConnect(NetworkClient.java:180)
        at sun.net.www.http.HttpClient.openServer(HttpClient.java:463)
        at sun.net.www.http.HttpClient.openServer(HttpClient.java:558)
        at sun.net.www.http.HttpClient.<init>(HttpClient.java:242)
        at sun.net.www.http.HttpClient.New(HttpClient.java:339)
        at sun.net.www.http.HttpClient.New(HttpClient.java:357)
        at sun.net.www.protocol.http.HttpURLConnection.getNewHttpClient(HttpURLConnection.java:1220)
        at sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1156)
        at sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:1050)
        at sun.net.www.protocol.http.HttpURLConnection.connect(HttpURLConnection.java:984)
        at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1564)
        at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1492)
        at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480)
        at io.confluent.kafka.schemaregistry.client.rest.RestService.sendHttpRequest(RestService.java:174)
        at io.confluent.kafka.schemaregistry.client.rest.RestService.httpRequest(RestService.java:218)
        at io.confluent.kafka.schemaregistry.client.rest.RestService.getId(RestService.java:394)
        at io.confluent.kafka.schemaregistry.client.rest.RestService.getId(RestService.java:387)
        at io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient.getSchemaByIdFromRegistry(CachedSchemaRegistryClient.java:65)
        at io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient.getBySubjectAndId(CachedSchemaRegistryClient.java:138)
        at io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer.deserialize(AbstractKafkaAvroDeserializer.java:122)
        at io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer.deserializeWithSchemaAndVersion(AbstractKafkaAvroDeserializer.java:194)
        at io.confluent.connect.avro.AvroConverter$Deserializer.deserialize(AvroConverter.java:121)
        at io.confluent.connect.avro.AvroConverter.toConnectData(AvroConverter.java:84)
        at org.apache.kafka.connect.runtime.WorkerSinkTask.convertMessages(WorkerSinkTask.java:454)
        at org.apache.kafka.connect.runtime.WorkerSinkTask.poll(WorkerSinkTask.java:287)
        at org.apache.kafka.connect.runtime.WorkerSinkTask.iteration(WorkerSinkTask.java:198)
        at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:166)
        at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:170)
        at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:214)
        at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
[2018-03-12 08:59:25,070] ERROR WorkerSinkTask{id=hdfs-sink-0} Task is being killed and will not recover until manually restarted (org.apache.kafka.connect.runtime.WorkerTask:173)
[2018-03-12 08:59:25,070] INFO Shutting down Hive executor service. (io.confluent.connect.hdfs.DataWriter:471)
[2018-03-12 08:59:25,070] INFO Awaiting termination. (io.confluent.connect.hdfs.DataWriter:476)

1 Answer


Not sure why you are still using byte[] in your Producer when you can actually use an Avro object.

Also, you have sent no key, so it is not clear why you have set the key serializer to be the Avro one. I would suggest setting the integers in your loop as keys.

props.put("key.serializer", "org.apache.kafka.common.serialization.IntegerSerializer");
Producer<Integer, GenericRecord> producer = new KafkaProducer<Integer, GenericRecord>(props);

for (int i = 0; i < 1000; i++) {
    GenericData.Record avroRecord = new GenericData.Record(schema);
    avroRecord.put("str1", "Str 1-" + i);
    avroRecord.put("str2", "Str 2-" + i);
    avroRecord.put("int1", i);

    ProducerRecord<String, GenericRecord> data = new ProducerRecord<String, GenericRecord>("StreamExample_1", new Integer(i), avroRecord);
    producer.send(data);
}

producer.close();

Refer to the Confluent example code.

If you want to use Kafka Connect with Avro data, you need to update the value converter to

value.converter=io.confluent.connect.avro.AvroConverter
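
When Connect reads the topic, the AvroConverter behaves like a consumer using the Confluent Avro deserializer: it reads the schema ID embedded in each message and fetches that schema from the Schema Registry, which is why the Connect worker also needs a reachable schema.registry.url (this is the lookup that fails in the stack trace above). A minimal consumer sketch for verifying the topic outside of Connect, assuming integer keys as above (the group ID is made up for this example):

import java.util.Collections;
import java.util.Properties;

import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

Properties props = new Properties();
props.put("bootstrap.servers", "localhost:9094");
props.put("group.id", "stream-example-check"); // made-up group id
props.put("auto.offset.reset", "earliest");
props.put("key.deserializer", "org.apache.kafka.common.serialization.IntegerDeserializer");
props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
props.put("schema.registry.url", "http://localhost:8084");

// The Avro deserializer returns GenericRecord instances for generic schemas
try (KafkaConsumer<Integer, GenericRecord> consumer = new KafkaConsumer<>(props)) {
    consumer.subscribe(Collections.singletonList("StreamExample_1"));
    ConsumerRecords<Integer, GenericRecord> records = consumer.poll(5000);
    for (ConsumerRecord<Integer, GenericRecord> record : records) {
        System.out.println(record.key() + " -> " + record.value());
    }
}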
  • Thanks for the response. As you suggested, I am sending an Avro record instead of byte[], and I added value.converter=io.confluent.connect.avro.AvroConverter. After the modifications I get the following exception: org.apache.kafka.connect.errors.DataException: mytopic – Madhu Mar 12 '18 at 09:06
  • One line of an exception doesn't help me answer your problem. – OneCricketeer Mar 12 '18 at 09:11
  • I am unable to post the stack trace in comments, so I edited the question accordingly. I also changed the topic name from mytopic to StreamExample_1 – Madhu Mar 12 '18 at 09:28
  • I can see that StreamExample_1-value and StreamExample_1-key got created in the Schema Registry – Madhu Mar 12 '18 at 10:53
  • 1) `key.converter.schema.registry.url` is not needed. The keys in my answer are not Avro. So you should have `key.converter.schemas.enable=false` 2) `java.net.ConnectException: Connection refused` means your Schema Registry is either not running, or the port number is wrong... – OneCricketeer Mar 12 '18 at 17:05
  • I could not resolve the issue even when the Schema Registry was running. I destroyed the Confluent services and restarted them all. Finally, I am able to post data to Hive using the Kafka producer. Thanks for the support – Madhu Mar 13 '18 at 08:00