I am using PDF Clown, and the code below takes around 100 seconds to highlight the search keywords in a single file. Kindly provide your inputs for improving the performance of this code. The JAR files needed to run it can be found at the following URL: https://drive.google.com/drive/folders/1nW8bk6bcAG6g7LZYy2YAAMk46hI9IPUh
import java.awt.Color;
import java.awt.Desktop;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.File;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
import org.pdfclown.tools.TextExtractor;
public class pdfclown2 {
private static int count;
public static void main(String[] args) throws IOException {
highlight("book.pdf","C:\\Users\\\Downloads\\6.pdf");
System.out.println("OK");
}
private static void highlight(String inputPath, String outputPath) throws IOException {
URL url = new URL(inputPath);
InputStream in = url.openStream();
org.pdfclown.files.File file = null;
//"C:\\Users\\Desktop\\pdf\\80743064.pdf"
try {
file = new org.pdfclown.files.File("C:\\Users\\uc23\\Desktop\\pdf\\80743064.pdf);
Map<String, String> m = new HashMap<String, String>();
for(int i=0;i<3500;i++){
if(i<=2){
m.put("The","hi");
m.put("know","hello");
m.put("is","Welcome");
}else{
m.put(""+i,"hi");
}
}
System.out.println("map size"+m.size());
long startTime = System.currentTimeMillis();
for (Map.Entry<String, String> entry : m.entrySet()) {
Pattern pattern;
String serachKey = entry.getKey().toLowerCase();
final String translationKeyword = entry.getValue();
if ((serachKey.contains(")") && serachKey.contains("("))
|| (serachKey.contains("(") && !serachKey.contains(")"))
|| (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
|| serachKey.contains("*") || serachKey.contains("+")) {
pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
}
else
pattern = Pattern.compile( "\\b"+serachKey+"\\b", Pattern.CASE_INSENSITIVE);
// 2. Iterating through the document pages...
TextExtractor textExtractor = new TextExtractor(true, true);
for (final Page page : file.getDocument().getPages()) {
// 2.1. Extract the page text!
Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
//System.out.println(textStrings.toString().indexOf(entry.getKey()));
// 2.2. Find the text pattern matches!
final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings).toLowerCase());
// 2.3. Highlight the text pattern matches!
textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
public boolean hasNext() {
// System.out.println(matcher.find());
// if(key.getMatchCriteria() == 1){
if (matcher.find()) {
return true;
}
/*
* } else if(key.getMatchCriteria() == 2) { if
* (matcher.hitEnd()) { count++; return true; } }
*/
return false;
}
public Interval<Integer> next() {
return new Interval<Integer>(matcher.start(), matcher.end());
}
public void process(Interval<Integer> interval, ITextString match) {
// Defining the highlight box of the text pattern
// match...
System.out.println(match);
List<Quad> highlightQuads = new ArrayList<Quad>();
{
Rectangle2D textBox = null;
for (TextChar textChar : match.getTextChars()) {
Rectangle2D textCharBox = textChar.getBox();
if (textBox == null) {
textBox = (Rectangle2D) textCharBox.clone();
} else {
if (textCharBox.getY() > textBox.getMaxY()) {
highlightQuads.add(Quad.get(textBox));
textBox = (Rectangle2D) textCharBox.clone();
} else {
textBox.add(textCharBox);
}
}
}
textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
highlightQuads.add(Quad.get(textBox));
}
new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);
}
public void remove() {
throw new UnsupportedOperationException();
}
});
}
}
SerializationModeEnum serializationMode = SerializationModeEnum.Incremental;
file.save(new java.io.File(outputPath), serializationMode);
System.out.println("file created");
long endTime = System.currentTimeMillis();
System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);
} catch (Exception e) {
e.printStackTrace();
}
finally{
in.close();
}
}
}