pdf clown- not highlighting specific search keyword

Question

I am using PDF Clown (pdfclown-0.2.0-HEAD.jar). I have written the code below to highlight search keywords in a Chinese-language PDF file. The same code works fine with an English PDF file, but in the Chinese document the keywords are not highlighted correctly.

import java.awt.Color;
import java.awt.Desktop;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.BufferedInputStream;
import java.io.File;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;

import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
import org.pdfclown.tools.TextExtractor;

public class pdfclown2 {
    private static int count;

    public static void main(String[] args) throws IOException {

        highlight("ebook.pdf","C:\\Users\\Downloads\\6.pdf");
        System.out.println("OK");
    }
    private static void highlight(String inputPath, String outputPath) throws IOException {

        URL url = new URL(inputPath);
        InputStream in = new BufferedInputStream(url.openStream());
        org.pdfclown.files.File file = null;

        try {
            file = new org.pdfclown.files.File("C:\\Users\\Desktop\\pdf\\test123.pdf");

        Map<String, String> m = new HashMap<String, String>();
            m.put("亿元或","hi");
            m.put("收入亿来","hi");



        System.out.println("map size"+m.size());
         long startTime = System.currentTimeMillis();




            // 2. Iterating through the document pages...
            TextExtractor textExtractor = new TextExtractor(true, true);
            for (final Page page : file.getDocument().getPages()) {
                Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
                for (Map.Entry<String, String> entry : m.entrySet()) {

                    Pattern pattern;
                    String serachKey =  entry.getKey();
                    final String translationKeyword = entry.getValue();
                /*
                        if ((serachKey.contains(")") && serachKey.contains("("))
                                || (serachKey.contains("(") && !serachKey.contains(")"))
                                || (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
                                || serachKey.contains("*") || serachKey.contains("+")) {s
                            pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
                        }
                        else*/
                             pattern = Pattern.compile(serachKey, Pattern.CASE_INSENSITIVE);
                // 2.1. Extract the page text!

            //System.out.println(textStrings.toString().indexOf(entry.getKey()));

                // 2.2. Find the text pattern matches!
                final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));
                // 2.3. Highlight the text pattern matches!
                textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
                    public boolean hasNext() {
                        // System.out.println(matcher.find());
                        // if(key.getMatchCriteria() == 1){
                        if (matcher.find()) {
                            return true;
                        }
                        /*
                         * } else if(key.getMatchCriteria() == 2) { if
                         * (matcher.hitEnd()) { count++; return true; } }
                         */
                        return false;

                    }

                    public Interval<Integer> next() {
                        return new Interval<Integer>(matcher.start(), matcher.end());
                    }

                    public void process(Interval<Integer> interval, ITextString match) {
                        // Defining the highlight box of the text pattern
                        // match...
                        System.out.println(match);
                    /*  List<Quad> highlightQuads = new ArrayList<Quad>();
                        {
                            Rectangle2D textBox = null;
                            for (TextChar textChar : match.getTextChars()) {
                                Rectangle2D textCharBox = textChar.getBox();
                                if (textBox == null) {
                                    textBox = (Rectangle2D) textCharBox.clone();
                                } else {
                                    if (textCharBox.getY() > textBox.getMaxY()) {
                                        highlightQuads.add(Quad.get(textBox));
                                        textBox = (Rectangle2D) textCharBox.clone();
                                    } else {
                                        textBox.add(textCharBox);
                                    }
                                }
                            }
                            textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
                            highlightQuads.add(Quad.get(textBox));
                        }*/
                        List<Quad> highlightQuads = new ArrayList<Quad>();
                        List<TextChar> textChars = match.getTextChars();
                        Rectangle2D firstRect = textChars.get(0).getBox();
                        Rectangle2D lastRect = textChars.get(textChars.size()-1).getBox();
                        Rectangle2D rect = firstRect.createUnion(lastRect);
                        highlightQuads.add(Quad.get(rect).get(rect));
                        // subtype can be Highlight, Underline, StrikeOut, Squiggly


                        new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);

                    }

                    public void remove() {
                        throw new UnsupportedOperationException();
                    }

                });
            }

        }

        SerializationModeEnum serializationMode = SerializationModeEnum.Standard;

            file.save(new java.io.File(outputPath), serializationMode);

            System.out.println("file created");
            long endTime = System.currentTimeMillis();

             System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);

        } catch (Exception e) {
               e.printStackTrace();
        }
        finally{
            in.close();
        }


    }
}

Please advise how to highlight specific search keywords in non-English PDF files.

I am searching for the keyword in the text below, which is in Chinese.

普双套习近平修宪普京利用双套车绕开宪法装班要走普京

enter image description here

@mkl Please have the following chinese characters and convert into pdf file and use the same file. **普京“双套车”、近平修宪和杜特尔特跟进近平修宪前，俄罗斯总统普京利用“双套车”绕开宪法延长政治权力，现在菲律宾总统跃跃欲试** — seshadri p, Jan 23 '18 at 11:49
The issue might be due to the way your PDF has been created. Then what would it help if I created a PDF from your text and reported "cannot reproduce your problem"? — mkl, Jan 23 '18 at 11:51
@mkl i have sent the pdf to your email id for testing purpose.Kindly provide your inputs to resolve this issue. — seshadri p, Jan 31 '18 at 07:30
I just tested your code against your document. The result here differs, though: [screenshot](https://i.stack.imgur.com/taGqy.png). So it is still wrong but wrong in a different way. Have you probably not send the original PDF but instead re-created it? Or is your pdfclown-0.2.0-HEAD.jar not built from the "current" trunk state of the repository TRUNK? — mkl, Feb 02 '18 at 09:41
I took the jar from the URL below, which is built properly. I used the same jar with an English PDF document, and it works fine. Please let me know which version of the jar will work for non-English PDF documents, and send me the link to that jar. Thanks in advance. https://github.com/tymate/mavenrepo/tree/master/org/pdfclown/pdfclown/0.2.0-HEAD — seshadri p, Feb 02 '18 at 10:26
The original PdfClown code is hosted on source forge. The current 0.2.0 development version, therefore, must be compiled from the svn TRUNK there. As mentioned above, though, there are issues, too, in that version, on the screen shot you'll see that not all marked areas are at the correct position. — mkl, Feb 02 '18 at 12:26
@mkl As you said, I made the changes in ShowText.java. But I am still facing a different issue: the text is highlighted in the wrong places. Please share your code with the modified jar to my email address, so that I am able to test it. — seshadri p, Feb 07 '18 at 07:24
gmail does not appear to accept jar attachments. You can download my current PDF Clown jars [here](https://drive.google.com/drive/folders/1nW8bk6bcAG6g7LZYy2YAAMk46hI9IPUh?usp=sharing). It is compiled from the develop branch of [my copy](https://github.com/mkl-public/pdfclown) of the source forge SVN on github. There also are two other fixes/work-arounds in it. — mkl, Feb 07 '18 at 08:26
*I am having another issue like i need to read from below url and highlight search keyword text then i need to show the pdf with this url in pop window using javascript ex* - Please make it a question in its own right. And probably even split it up into three parts, "How to read a PDF from an URL into PDF Clown", "How to return a PDF from PDF Clown in a web request to the user", and "How to display a PDF retrieved via web request in a browser pop up". I in particular might help with the former two parts but have no idea about the final one, I've merely read that that might be problematic. — mkl, Feb 09 '18 at 07:56

score 0 · Accepted Answer · edited Jun 20 '20 at 09:12

Your PDF Clown version

The PDF Clown version you retrieved here from Tymate's maven repository on github has been pushed there April 23rd, 2015. The final (as of now) check-in to the PDF Clown subversion source code repository TRUNK on sourceforge, on the other hand, is from May 27th, 2015. There actually are some 30 checkins after April 23rd, 2015. Thus, you definitely do not use the most current version of this apparently dead PDF library project.

Using the current 0.2.0 snapshot

I tested your code with the 0.2.0 development version compiled from that trunk and the result indeed is different:

screenshot still somewhat buggy

It is better insofar as the highlights have the width of the sought character and are located nearer to the actual character position. There still is a bug, though, as the second and third match highlights are somewhat off.

Fixing the bug

The remaining problem actually is not related to the language of the text. It simply is a bug in the processing of one type of the PDF text drawing commands, so it can be observed in documents with text in arbitrary languages. Due to the fact that these commands nowadays are used very seldom only, though, the bug is hardly ever observed, let alone reported. Your PDF, on the other hand, makes use of that kind of text drawing commands.

The bug is in the ShowText class (package org.pdfclown.documents.contents.objects). At the end of the scan method the text line matrix in the graphics state is updated like this if the ShowText instance actually is a ShowTextToNextLine instance derived from it:

// Excerpt from the end of ShowText.scan as found in the PDF Clown trunk (buggy).
if(textScanner == null)
{
  state.setTm(tm);

  // BUG: for ShowTextToNextLine the text line matrix is set to a copy of tm,
  // i.e. the text matrix after the move to the next line AND after drawing the text.
  if(this instanceof ShowTextToNextLine)
  {state.setTlm((AffineTransform)tm.clone());}
}

The text line matrix here is set to the text matrix after the move to the next line and the drawing of the text. This is wrong, it must instead be set to text matrix right after the move to the next line before the drawing of the text.

This can be fixed e.g. like this:

// Corrected excerpt for the end of ShowText.scan.
if(textScanner == null)
{
  state.setTm(tm);

  // For ShowTextToNextLine, translate the text line matrix by -lead along y
  // (the move to the next line) instead of copying tm, which already contains
  // the advance caused by drawing the text.
  if(this instanceof ShowTextToNextLine)
    state.getTlm().concatenate(new AffineTransform(1, 0, 0, 1, 0, -state.getLead()));
}

With this change in place the result looks like this:

I am still facing an issue: some of the search keywords are not highlighted in Chinese documents. Due to confidentiality concerns I cannot provide the actual PDF. The search keywords are 1) 亿元或 and 2) 收入亿来源. Please find the PDF document I tested here: original PDF [link](https://drive.google.com/file/d/1i4MApZA8HrxlU7n6Hjb9u5HsU1vaEOJw/view), and actual result [link](https://drive.google.com/file/d/194vYKDpKa5wZslUsKiFSeXjO2z3TvZOc/view) — seshadri p, Feb 23 '18 at 12:00
@seshadrip This sample highlighted yet another error of PDF Clown. I extended the answer with a fix of that. But please create new questions for new issues. As you can see in this case, the issue you presented in the original question and the issue you presented in the comment here are caused by completely different PDF Clown bugs and, therefore, deserve their own question each. EDIT: Oh, I see you ***did*** create such a new question eventually. I'm going to answer it as above. — mkl, Feb 27 '18 at 07:53
Ok, I moved the solution of the new issue to your new question as https://stackoverflow.com/a/49003958/1729265 — mkl, Feb 27 '18 at 08:12

pdf clown- not highlighting specific search keyword

1 Answer

Your PDF Clown version

Using the current 0.2.0 snapshot

Fixing the bug

Linked