I am writing a Kafka source connector based on a working producer that I use for audio files. The connector starts, but nothing happens: no errors, no data. I am not sure whether this is a coding problem or a configuration problem.

The connector should read an entire directory and send each file's contents as a byte array.

Config class:

package hothman.example;

import org.apache.kafka.common.config.AbstractConfig;
import org.apache.kafka.common.config.ConfigDef;
import org.apache.kafka.common.config.ConfigDef.Type;
import org.apache.kafka.common.config.ConfigDef.Importance;
import java.util.Map;



public class AudioSourceConnectorConfig extends AbstractConfig {

  public static final String FILENAME_CONFIG="fileName";
  private static final String FILENAME_DOC ="Enter the path of the audio files";

  public static final String TOPIC_CONFIG = "topic";
  private static final String TOPIC_DOC = "Enter the topic to write to..";



  public AudioSourceConnectorConfig(ConfigDef config, Map<String, String> parsedConfig) {
    super(config, parsedConfig);
  }

  public AudioSourceConnectorConfig(Map<String, String> parsedConfig) {
    this(conf(), parsedConfig);
  }

  public static ConfigDef conf() {
    return new ConfigDef()
            .define(FILENAME_CONFIG, Type.STRING, Importance.HIGH, FILENAME_DOC)
            .define(TOPIC_CONFIG, Type.STRING, Importance.HIGH, TOPIC_DOC);

  }

  public String getFilenameConfig(){
    return this.getString("fileName");
  }
  public String getTopicConfig(){
    return this.getString("topic");
  }
}

SourceConnector class

package hothman.example;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.kafka.common.config.ConfigDef;
import org.apache.kafka.connect.connector.Task;
import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.source.SourceConnector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;



public class AudioSourceConnector extends SourceConnector {
  /*
    Your connector should never use System.out for logging. All of your classes should use slf4j
    for logging
 */
  private static Logger log = LoggerFactory.getLogger(AudioSourceConnector.class);

  private AudioSourceConnectorConfig config;
  private String filename;
  private String topic;

  @Override
  public String version() {
    return VersionUtil.getVersion();
  }

  @Override
  public void start(Map<String, String> props) {
    filename = config.getFilenameConfig();
    topic = config.getTopicConfig();
    if (topic == null || topic.isEmpty())
      throw new ConnectException("AudiSourceConnector configuration must include 'topic' setting");
    if (topic.contains(","))
      throw new ConnectException("AudioSourceConnector should only have a single topic when used as a source.");
  }

  @Override
  public Class<? extends Task> taskClass() {
    //TODO: Return your task implementation.
    return AudioSourceTask.class;
  }

  @Override
  public List<Map<String, String>> taskConfigs(int maxTasks) {
    ArrayList<Map<String, String>> configsList = new ArrayList<>();
    // Only one input stream makes sense.
    Map<String, String> configs = new HashMap<>();
    if (filename != null)
      configs.put(config.getFilenameConfig(), filename);
    configs.put(config.getTopicConfig(), topic);
    configsList.add(configs);
    return configsList;
  }

  @Override
  public void stop() {

  }
  @Override
  public ConfigDef config() {
    return AudioSourceConnectorConfig.conf();

  }
}

SourceTask class

package hothman.example;


import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaAndValue;
import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.source.SourceRecord;
import org.apache.kafka.connect.source.SourceTask;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;

import java.nio.file.*;
import java.util.*;

import static com.sun.nio.file.ExtendedWatchEventModifier.FILE_TREE;
import static java.nio.file.StandardWatchEventKinds.ENTRY_CREATE;
import static java.nio.file.StandardWatchEventKinds.ENTRY_DELETE;

public class AudioSourceTask extends SourceTask {
  /*
    Your connector should never use System.out for logging. All of your classes should use slf4j
    for logging
 */
  static final Logger log = LoggerFactory.getLogger(AudioSourceTask.class);

  private AudioSourceConnectorConfig config;
  public static final String POSITION_FIELD = "position";
  private static final Schema VALUE_SCHEMA = Schema.BYTES_SCHEMA;

  private String filename;
  private String topic = null;
  private int offset = 0;


  private FileSystem fs = FileSystems.getDefault();
  private WatchService ws = fs.newWatchService();

  private Path dir;
  private File directoryPath;
  private ArrayList<File> listOfFiles;
  private byte[] temp = null;


  public AudioSourceTask() throws IOException {
  }

  @Override
  public String version() {
    return VersionUtil.getVersion();
  }

  @Override
  public void start(Map<String, String> props) {
    filename = config.getFilenameConfig();
    topic = config.getTopicConfig();
    if (topic == null)
      throw new ConnectException("AudioSourceTask config missing topic setting");

    dir = Paths.get(filename);
    try {
      dir.register(ws, new WatchEvent.Kind[]{ENTRY_CREATE, ENTRY_DELETE}, FILE_TREE);
    } catch (IOException e) {
      e.printStackTrace();
    }

    directoryPath = new File(String.valueOf(dir));
  }


  @Override
  public List<SourceRecord> poll() throws InterruptedException {
    //TODO: Create SourceRecord objects that will be sent to the Kafka cluster.

    listOfFiles = new ArrayList<File>(Arrays.asList(directoryPath.listFiles()));
    Map<String, Object> offset = context.offsetStorageReader().
            offset(Collections.singletonMap(config.getFilenameConfig(), filename));


    ArrayList<SourceRecord> records = new ArrayList<>(1);

    try {
      for (File file : listOfFiles) {
        // send existing files first
        temp = Files.readAllBytes(Paths.get(file.toString()));

        records.add(new SourceRecord(null,
                null, topic, Schema.BYTES_SCHEMA, temp));
       
      }

      return records;
    } catch (IOException e) {
      e.printStackTrace();
    }
    return null;
  }

  @Override
  public void stop() {
    //TODO: Do whatever is required to stop your task.
  }




}

VersionUtil class

package hothman.example;

/**
 * Created by jeremy on 5/3/16.
 */
class VersionUtil {
  public static String getVersion() {
    try {
      return VersionUtil.class.getPackage().getImplementationVersion();
    } catch(Exception ex){
      return "0.0.0.0";
    }
  }
}

Connector.properties

name=AudioSourceConnector
tasks.max=1
connector.class=hothman.example.AudioSourceConnector


fileName=G:\\Files
topic=my-topic

Connect-standalone.properties

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# These are defaults. This file just demonstrates how to override some settings.
bootstrap.servers=localhost:9092

# The converters specify the format of data in Kafka and how to translate it into Connect data. Every Connect user will
# need to configure these based on the format they want their data in when loaded from or stored into Kafka
#key.converter=org.apache.kafka.connect.json.JsonConverter
#value.converter=org.apache.kafka.connect.json.JsonConverter


key.converter=org.apache.kafka.connect.storage.StringConverter
value.converter=org.apache.kafka.connect.converters.ByteArrayConverter


# Converter-specific settings can be passed in by prefixing the Converter's setting with the converter we want to apply
# it to
key.converter.schemas.enable=false
value.converter.schemas.enable=false

offset.storage.file.filename=G:/Kafka/kafka_2.12-2.8.0/tmp/connect.offsets
# Flush much faster than normal, which is useful for testing/debugging
offset.flush.interval.ms=10000

# Set to a list of filesystem paths separated by commas (,) to enable class loading isolation for plugins
# (connectors, converters, transformations). The list should consist of top level directories that include 
# any combination of: 
# a) directories immediately containing jars with plugins and their dependencies
# b) uber-jars with plugins and their dependencies
# c) directories immediately containing the package directory structure of classes of plugins and their dependencies
# Note: symlinks will be followed to discover dependencies or plugins.
# Examples: 
# plugin.path=/usr/local/share/java,/usr/local/share/kafka/plugins,/opt/connectors,
plugin.path=G:/Kafka/kafka_2.12-2.8.0/plugins

ERROR:

[2021-05-05 01:24:27,926] INFO WorkerSourceTask{id=AudioSourceConnector-0} flushing 0 outstanding messages for offset commit (org.apache.kafka.connect.runtime.WorkerSourceTask:487)
[2021-05-05 01:24:27,928] ERROR WorkerSourceTask{id=AudioSourceConnector-0} Task threw an uncaught and unrecoverable exception. Task is being killed and will not recover until manually restarted (org.apache.kafka.connect.runtime.WorkerTask:184)
java.lang.OutOfMemoryError: Java heap space
        at java.nio.file.Files.read(Files.java:3099)
        at java.nio.file.Files.readAllBytes(Files.java:3158)
        at hothman.example.AudioSourceTask.poll(AudioSourceTask.java:93)
        at org.apache.kafka.connect.runtime.WorkerSourceTask.poll(WorkerSourceTask.java:273)
        at org.apache.kafka.connect.runtime.WorkerSourceTask.execute(WorkerSourceTask.java:240)
        at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:182)
        at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:231)
        at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
[2021-05-05 01:24:27,929] INFO [Producer clientId=connector-producer-AudioSourceConnector-0] Closing the Kafka producer with timeoutMillis = 30000 ms. (org.apache.kafka.clients.producer.KafkaProducer:1204)
[2021-05-05 01:24:27,933] INFO Metrics scheduler closed (org.apache.kafka.common.metrics.Metrics:659)
[2021-05-05 01:24:27,934] INFO Closing reporter org.apache.kafka.common.metrics.JmxReporter (org.apache.kafka.common.metrics.Metrics:663)
[2021-05-05 01:24:27,934] INFO Metrics reporters closed (org.apache.kafka.common.metrics.Metrics:669)
[2021-05-05 01:24:27,935] INFO App info kafka.producer for connector-producer-AudioSourceConnector-0 unregistered (org.apache.kafka.common.utils.AppInfoParser:83)
[2021-05-05 01:24:36,479] INFO WorkerSourceTask{id=AudioSourceConnector-0} flushing 0 outstanding messages for offset commit (org.apache.kafka.connect.runtime.WorkerSourceTask:487)
  • Have you tried using the logger that you created to actually see where execution ends up? – OneCricketeer May 04 '21 at 23:38
  • Or any unit test that actually verifies you're reading the directory of files into an arraylist correctly? Also, certainly you want the name of the file as part of the record? Otherwise, you wouldn't really know if you've already processed the same byte array. You never did respond to my comment about setting up a debugger https://stackoverflow.com/questions/66766961/kafka-connect-for-audio-files#comment118038192_66766961 – OneCricketeer May 04 '21 at 23:43
  • I am trying to use the logger, but I'm not sure how or where; I am reading about it, same for setting up a debugger. I am using the same code to read a directory of different types of audio files into an ArrayList with a producer, and it works, which is why I thought converting the producer into a connector should not be that difficult. – AinzOwlGown May 05 '21 at 04:49
  • For starters, in the poll method, you could write `log.info("Reading file {}", file)`; then if it never prints anything when you run the connector, you know it never reached that line, so add logs at the start of the taskConfigs method to print other useful information, or just `log.info("got here")`. And if you have separate code that works as you expect, it'd be best to package that as classes that you'd import and expose through interfaces rather than directly porting into the Connect worker methods. – OneCricketeer May 05 '21 at 04:55
  • `log.info("got here")` inside taskConfigs got printed; the one inside the poll method didn't. I also tried to print from inside the start method in the task class, but nothing happened there either. – AinzOwlGown May 05 '21 at 05:04
  • Found the problem! `config.getFilenameConfig()` is not returning anything, so I encoded the path manually. Now the task is running, but I ran into the error that I posted in the question above. – AinzOwlGown May 05 '21 at 05:28
  • I solved the above error and the connector is now working, all thanks to you telling me to use the logger. I also had to edit **connect-standalone.properties** and change the sizes of **producer.max.request.size** and **producer.buffer.memory**. Now the only thing that is not working properly is listening for newly added files: whenever I add a file to the directory, the connector gives **The process cannot access the file because it is being used by another process**. I guess I have to give it a waiting time, and also find out why **config.getFilenameConfig** is not working. – AinzOwlGown May 05 '21 at 07:20
  • If it's solved, feel free to post your solution below. And yeah, you're never using the WatchService at all, but one idea would be to create an ArrayDeque object that gets populated on file-creation events; then your poll method would grab files off of the queue rather than list the full directory content. I'm also not sure if the WatchService initially gathers existing files or not... In any case, the OOM error can be fixed by exporting `KAFKA_HEAP_OPTS="-Xmx4g"` before starting Connect if you want to double the memory. – OneCricketeer May 05 '21 at 13:53

1 Answer

Using the logger, per @OneCricketeer's recommendation, I was able to pinpoint the problem:

config.getFilenameConfig();

returns null, so I had to hard-code the path in the connector for the time being.
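
Judging from the classes as posted, one likely culprit is that the `config` field is never assigned in either `start()` method, and that `taskConfigs()` uses the configured values as map keys instead of the config names. A minimal sketch of that fix for `AudioSourceConnector` (a corresponding `config = new AudioSourceConnectorConfig(props);` line would also be needed in `AudioSourceTask.start()`):

@Override
public void start(Map<String, String> props) {
  // Build the config object from the props Connect passes in;
  // as posted, the 'config' field is never assigned and stays null.
  config = new AudioSourceConnectorConfig(props);
  filename = config.getFilenameConfig();
  topic = config.getTopicConfig();
  if (topic == null || topic.isEmpty())
    throw new ConnectException("AudioSourceConnector configuration must include 'topic' setting");
  if (topic.contains(","))
    throw new ConnectException("AudioSourceConnector should only have a single topic when used as a source.");
}

@Override
public List<Map<String, String>> taskConfigs(int maxTasks) {
  // The keys must be the config names (e.g. "fileName"), not the
  // configured values, so the task can read them back from its props.
  List<Map<String, String>> configsList = new ArrayList<>();
  Map<String, String> configs = new HashMap<>();
  if (filename != null)
    configs.put(AudioSourceConnectorConfig.FILENAME_CONFIG, filename);
  configs.put(AudioSourceConnectorConfig.TOPIC_CONFIG, topic);
  configsList.add(configs);
  return configsList;
}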

The connector then worked but threw a java.lang.OutOfMemoryError: Java heap space error. To fix this, I had to edit the connect-standalone.properties file and change producer.max.request.size and producer.buffer.memory, making sure their values are higher than the size of any file I am going to send.
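
For illustration, the producer overrides in connect-standalone.properties might look like the following; the 50 MB value is only a placeholder, pick something larger than your biggest audio file:

# Worker-level overrides applied to the source task's producer.
producer.max.request.size=52428800
producer.buffer.memory=52428800

Note that the broker independently caps message sizes via message.max.bytes, so oversized files can still be rejected server-side.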

I have also edited the AudioSourceTask class: I got rid of the for loop in the poll method, so poll now sends one file per call, and moved the initialization of listOfFiles from the poll method to the start method. The two methods now look like this:

@Override
public void start(Map<String, String> props) {
  filename = "G:\\AudioFiles"; // config.getFilenameConfig() returned null, hard-coded for now
  topic = "voice-wav1";        // config.getTopicConfig()
  if (topic == null)
    throw new ConnectException("AudioSourceTask config missing topic setting");

  dir = Paths.get(filename);
  try {
    dir.register(ws, new WatchEvent.Kind[]{ENTRY_CREATE, ENTRY_DELETE}, FILE_TREE);
  } catch (IOException e) {
    e.printStackTrace();
  }

  directoryPath = new File(String.valueOf(dir));
  listOfFiles = new ArrayList<File>(Arrays.asList(directoryPath.listFiles()));
}

@Override
public List<SourceRecord> poll() throws InterruptedException {
  // Create SourceRecord objects that will be sent to the Kafka cluster.
  Map<String, Object> offset = context.offsetStorageReader()
          .offset(Collections.singletonMap("G:\\AudioFiles", filename));

  ArrayList<SourceRecord> records = new ArrayList<>(1);

  try {
    // Send existing files first, one file per poll() call.
    if (listOfFiles.size() != 0) {
      File file = listOfFiles.get(listOfFiles.size() - 1);
      listOfFiles.remove(listOfFiles.size() - 1);
      temp = Files.readAllBytes(Paths.get(file.toString()));
      records.add(new SourceRecord(null, null, topic, Schema.BYTES_SCHEMA, temp));
      log.info("Reading file {}", file);
      return records;
    }
  } catch (IOException e) {
    e.printStackTrace();
  }

  return null;
}
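
For the remaining problem, listening for newly added files, note that the registered WatchService is never actually drained. Below is a minimal sketch of the queue-based approach suggested in the comments; it assumes the same `ws`, `dir`, `topic`, and `log` fields as above (both `java.util.*` and `java.nio.file.*` are already imported), and the `pending` deque plus the retry-on-IOException handling are my additions to work around the "file is being used by another process" error:

private final ArrayDeque<File> pending = new ArrayDeque<>();

@Override
public List<SourceRecord> poll() throws InterruptedException {
  // Drain any creation events that arrived since the last poll.
  WatchKey key = ws.poll();
  if (key != null) {
    for (WatchEvent<?> event : key.pollEvents()) {
      if (event.kind() == ENTRY_CREATE) {
        pending.add(dir.resolve((Path) event.context()).toFile());
      }
    }
    key.reset(); // re-arm the key, or no further events are delivered
  }

  if (pending.isEmpty()) {
    Thread.sleep(1000); // nothing to send; avoid busy-looping
    return null;
  }

  File file = pending.poll();
  try {
    byte[] bytes = Files.readAllBytes(file.toPath());
    return Collections.singletonList(
            new SourceRecord(null, null, topic, Schema.BYTES_SCHEMA, bytes));
  } catch (IOException e) {
    // The writer may still hold the file open; put it back and retry later.
    log.warn("Could not read {} yet, retrying on a later poll", file, e);
    pending.addFirst(file);
    return null;
  }
}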