-2

I am using PDF Clown, and the code below takes around 100 seconds to highlight search keywords in the same file. Kindly provide your input for improving the performance of the code below. Please find the jar at the URL below to run this code. https://drive.google.com/drive/folders/1nW8bk6bcAG6g7LZYy2YAAMk46hI9IPUh

import java.awt.Color;
import java.awt.Desktop;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.File;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;

import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
import org.pdfclown.tools.TextExtractor;

public class pdfclown2 {
    private static int count;

    public static void main(String[] args) throws IOException {

        highlight("book.pdf","C:\\Users\\\Downloads\\6.pdf");
        System.out.println("OK");
    }
    private static void highlight(String inputPath, String outputPath) throws IOException {

        URL url = new URL(inputPath);
        InputStream in = url.openStream();
        org.pdfclown.files.File file = null;
        //"C:\\Users\\Desktop\\pdf\\80743064.pdf"
        try {
            file = new org.pdfclown.files.File("C:\\Users\\uc23\\Desktop\\pdf\\80743064.pdf);

        Map<String, String> m = new HashMap<String, String>();
    for(int i=0;i<3500;i++){

        if(i<=2){
        m.put("The","hi");
        m.put("know","hello");
        m.put("is","Welcome");
        }else{
            m.put(""+i,"hi");
        }
    }

        System.out.println("map size"+m.size());
         long startTime = System.currentTimeMillis();

        for (Map.Entry<String, String> entry : m.entrySet()) {

            Pattern pattern;
            String serachKey =  entry.getKey().toLowerCase();
            final String translationKeyword = entry.getValue();

                if ((serachKey.contains(")") && serachKey.contains("("))
                        || (serachKey.contains("(") && !serachKey.contains(")"))
                        || (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
                        || serachKey.contains("*") || serachKey.contains("+")) {
                    pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
                }
                else
                     pattern = Pattern.compile( "\\b"+serachKey+"\\b", Pattern.CASE_INSENSITIVE);


            // 2. Iterating through the document pages...
            TextExtractor textExtractor = new TextExtractor(true, true);
            for (final Page page : file.getDocument().getPages()) {
                // 2.1. Extract the page text!
                Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
            //System.out.println(textStrings.toString().indexOf(entry.getKey()));

                // 2.2. Find the text pattern matches!
                final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings).toLowerCase());
                // 2.3. Highlight the text pattern matches!
                textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
                    public boolean hasNext() {
                        // System.out.println(matcher.find());
                        // if(key.getMatchCriteria() == 1){
                        if (matcher.find()) {
                            return true;
                        }
                        /*
                         * } else if(key.getMatchCriteria() == 2) { if
                         * (matcher.hitEnd()) { count++; return true; } }
                         */
                        return false;

                    }

                    public Interval<Integer> next() {
                        return new Interval<Integer>(matcher.start(), matcher.end());
                    }

                    public void process(Interval<Integer> interval, ITextString match) {
                        // Defining the highlight box of the text pattern
                        // match...
                        System.out.println(match);
                        List<Quad> highlightQuads = new ArrayList<Quad>();
                        {
                            Rectangle2D textBox = null;
                            for (TextChar textChar : match.getTextChars()) {
                                Rectangle2D textCharBox = textChar.getBox();
                                if (textBox == null) {
                                    textBox = (Rectangle2D) textCharBox.clone();
                                } else {
                                    if (textCharBox.getY() > textBox.getMaxY()) {
                                        highlightQuads.add(Quad.get(textBox));
                                        textBox = (Rectangle2D) textCharBox.clone();
                                    } else {
                                        textBox.add(textCharBox);
                                    }
                                }
                            }
                            textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
                            highlightQuads.add(Quad.get(textBox));
                        }

                        new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);

                    }

                    public void remove() {
                        throw new UnsupportedOperationException();
                    }

                });
            }

        }

        SerializationModeEnum serializationMode = SerializationModeEnum.Incremental;

            file.save(new java.io.File(outputPath), serializationMode);

            System.out.println("file created");
            long endTime = System.currentTimeMillis();

             System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);

        } catch (Exception e) {
               e.printStackTrace();
        }
        finally{
            in.close();
        }


    }
}
Seshadri
  • 101
  • 1
  • 1
  • 9
  • You might want to share the PDF in question for analysis. The proposals in @Joop's answer make sense if his guess *that `process` is the bottle neck* is correct. It may be wrong, though, for example the pure parsing of the page content streams also might take a long time and in case of a small number of matches overall `process` hardly is of interest at all. – mkl Feb 21 '18 at 10:16

1 Answers1

1

My guess is that `process` is the bottleneck, which can easily be tested (comment the code out) and the times measured. A good opportunity to profile the application.

A simple heuristic optimisation: taking the first and last TextChar rectangles for one-liners, and considering font ascenders and descenders, one could create an entire rectangle. That would already speed things up.

Alternatives probably exist. Place a more specific question.

Further improvements:

    InputStream in = url.openStream();

should be

    InputStream in = new BufferedInputStream(url.openStream());

And the multiple `serachKey.contains` checks might possibly be replaced by a Pattern declared before the loop.

The same technique might be done for the original highlighting code, but then multi-line support should be added, a Quad for every line.

The textExtractor is reused for every page which seems the fastest way, but try declare it in the page loop.

I hope you get a more concrete answer, though I doubt it, hence this one. Better would have been to isolate the slow code from the entirety. But I understand the wish for overall performance gain.


A less precise, maybe faster highlight code:

                    List<TextChar> textChars = match.getTextChars();
                    Rectangle2D firstRect = textChars.get(0).getBox();
                    Rectangle2D lastRect = textChars.get(textChars.size() - 1).getBox();
                    Rectangle2D rect = firstRect.createUnion(lastRect);
                    highlightQuads.add(Quad.get(rect));

After other comment

It seems that the bottleneck lies elsewhere. My guess is then the text extraction: so invert the two loops:

TextExtractor textExtractor = new TextExtractor(true, true);
for (final Page page : file.getDocument().getPages()) {

    for (Map.Entry<String, String> entry : m.entrySet()) {
        Pattern pattern;
        String serachKey =  entry.getKey().toLowerCase();
        final String translationKeyword = entry.getValue();

        if ((serachKey.contains(")") && serachKey.contains("("))
                    || (serachKey.contains("(") && !serachKey.contains(")"))
                    || (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
                    || serachKey.contains("*") || serachKey.contains("+")) {
                pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
        }
        else
             pattern = Pattern.compile( "\\b"+serachKey+"\\b", Pattern.CASE_INSENSITIVE);

It probably makes sense to have a map of Pattern as Pattern.compile is slow.

And then I am out of ideas / have other things to do.

Joop Eggen
  • 107,315
  • 7
  • 83
  • 138
  • Thanks for your reply. I made the changes as per your previous answer, but performance is still slow. In my case the map size is 3500, so it takes 127 seconds to create this file; before the code changes it took 130 seconds. Kindly provide your input on improving the performance. – Seshadri Feb 21 '18 at 10:57
  • 1
    *"My guess is the text extraction then"* - quite reasonable, in particular after the OP increased the loop count for the creation of `m` from `35` to `3500` causing the text extraction to be executed more than 10000 times while it needed only be done once... The `textExtractor.extract(page)` call merely must not be within the loop over the entries of `m`. – mkl Feb 21 '18 at 11:24
  • @mkl indeed, a good profiling is worth much in such mix of APIs. Thanks for the positive echo. – Joop Eggen Feb 21 '18 at 11:28
  • @mkl-Thanks for your positive information.Really Awesome. – Seshadri Feb 21 '18 at 12:06