0

I wrote a simple consumer-producer pattern to help me achieve the following task:

  1. Read a file from a directory that contain ~500,000 TSV (tab-separated) files.
  2. Manipulate each file into a data structure and put it in blocking queue.
  3. Consume the queue using consumer and query DB.
  4. Compare both hash-maps and if there's a difference, print difference to file.

When I run the program, even with 5 threads, my CPU consumption skyrockets to 100%. Could this be because I'm using a single producer to read the files?

File example (tab separated)

Column1   Column2   Column3   Column 4   Column5
A         1         *         -          -
B         1         *         -          -
C         1         %         -          -

Producer

public class Producer implements Runnable{
private BlockingQueue<Map<String, Map<String, String>>> m_Queue;
private String m_Directory;

public Producer(BlockingQueue<Map<String, Map<String, String>>> i_Queue, String i_Directory)
{
    m_Queue = i_Queue;
    m_Directory = i_Directory;
}

@Override
public void run()
{
    if (Files.exists(Paths.get(m_Directory)))
    {
        File[] files = new File(m_Directory).listFiles();

        if (files != null)
        {
            for (File file : files)
            {
                Map<String, String> map = new HashMap<>();
                try (BufferedReader reader = new BufferedReader(new FileReader(file)))
                {
                    String line, lastcolumn3 = "", column1 = "", column2 = "", column3 = "";
                    while ((line = reader.readLine()) != null)
                    {
                        //Skip column header
                        if (!Character.isLetter(line.charAt(0)))
                        {
                            String[] splitLine = line.split("\t");

                            column1 = splitLine[0].replace("\"", "");
                            column2 = splitLine[1].replace("\"", "");
                            column3 = splitLine[2].replace("\"", "");

                            if (!lastcolumn3.equals(column3))
                            {
                                map.put(column3, column1);
                                lastcolumn3 = column3;
                            }
                        }
                    }

                    map.put(column3, column1);

                    //Column 1 is always the same per file, it'll be the key. Column2 and Column3 are stored as the value (as a key-value pair)
                    Map<String, Map<String, String>> mapPerFile = new HashMap<>();
                    mapPerFile.put(column2, map);

                    m_Queue.put(mapPerFile);
                }
                catch (IOException | InterruptedException e)
                {
                    System.out.println(file);
                    e.printStackTrace();
                }
            }
        }
    }
}}

Consumer

public class Consumer implements Runnable{
private HashMap<String, String> m_DBResults;
private BlockingQueue<Map<String, Map<String, String>>> m_Queue;
private Map<String, Map<String, String>> m_DBResultsPerFile;
private String m_Column1;
private int m_ThreadID;

public Consumer(BlockingQueue<Map<String, Map<String, String>>> i_Queue, int i_ThreadID)
{
    m_Queue = i_Queue;
    m_ThreadID = i_ThreadID;
}

@Override
public void run()
{
    try
    {
        while ((m_DBResultsPerFile = m_Queue.poll()) != null)
        {
            //Column1 is always the same, only need the first entry.
            m_Column1 = m_DBResultsPerFile.keySet().toArray()[0].toString();

            //Queries DB and puts returned data into m_DBResults
            queryDB(m_Column1);

            //Write the difference, if any, per thread into a file.
            writeDifference();
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
}

private void writeDifference()
{
    MapDifference<String, String> difference = Maps.difference(m_DBResultsPerFile.get(m_Column1), m_DBResults);

    if (difference.entriesOnlyOnLeft().size() > 0 || difference.entriesOnlyOnRight().size() > 0)
    {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(String.format("thread_%d.tsv", m_ThreadID), true)))
        {
            if (difference.entriesOnlyOnLeft().size() > 0)
            {
                writer.write(String.format("%s\t%s\t", "Missing", m_Column1));
                for (Map.Entry<String, String> entry : difference.entriesOnlyOnLeft().entrySet())
                {
                    writer.write(String.format("[%s,%s]; ", entry.getKey(), entry.getValue()));
                }

                writer.write("\n");
            }
            if (difference.entriesOnlyOnRight().size() > 0)
            {
                writer.write(String.format("%s\t%s\t", "Extra", m_Column1));
                for (Map.Entry<String, String> entry : difference.entriesOnlyOnRight().entrySet())
                {
                    writer.write(String.format("[%s,%s]; ", entry.getKey(), entry.getValue()));
                }

                writer.write("\n");
            }
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }
}}

Main

public static void main(String[]args) {
BlockingQueue<Map<String, Map<String,String>>> queue = new LinkedBlockingQueue <> ();

//Start the reader thread.
threadPool.execute(new Producer(queue, args[0]));

//Create configurable threads.
for (int i = 0; i < 10; i++) {
    threadPool.execute(new Consumer(queue, i + 1));
}

threadPool.shutdown();
System.out.println("INFO: Shutting down threads.");

try {
    threadPool.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
    System.out.println("INFO: Threadpool terminated successfully.");
} catch (InterruptedException e) {
    e.printStackTrace();
}}
ocp1000
  • 571
  • 1
  • 5
  • 12

1 Answers1

6

Your CPU usage is most likely due to this:

while ((m_DBResultsPerFile = m_Queue.poll()) != null)

The poll method does not block. It returns immediately. So you're executing that loop millions of times per second.

You should be using take(), which actually waits until an element becomes available:

while ((m_DBResultsPerFile = m_Queue.take()) != null)

The documentation for BlockingQueue nicely summarizes all of this, in a way that (in my opinion) eliminates any confusion.

VGR
  • 40,506
  • 4
  • 48
  • 63