I got the requirement in pdfclown like if there are few keywords which are substring/matched with another keyword, while highlighting those keywords has to be override and should allow to highlight full keyword .For example in below map ETS keyword is substring of just.ETS and Test.ETS keywords. And Expected result should be like We need to highlight full keyword like just.ETS , Test.ETS instead of ETS keyword and their popup measure value. .ActualPdf and actual result pdf. and jar path.
Map<String, String> m = new HashMap<String, String>();
map.put("ETS" , "Loss");
map.put("Just. ETS" , "Net ");
map.put("Test. ETS" , "Profit");
(Note:1. If large size keyword is already highlighted in file then small size keyword which are matched with large keyword should not allow to highlight 2. If small size keyword is already highlighted and this keyword matched with large keyword then large keyword should higlight and ignore/unhighlight the small keyword.).
import java.awt.Color;
import java.awt.Desktop;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.File;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
import org.pdfclown.tools.TextExtractor;
public class pdfclown2 {
private static int count;
public static void main(String[] args) throws IOException {
highlight("C:\\Users\\uc23\\Desktop\\pdf\\80743064.pdf","C:\\Users\\\Downloads\\6.pdf");
System.out.println("OK");
}
private static void highlight(String inputPath, String outputPath) throws IOException {
org.pdfclown.files.File file = null;
try {
file = new org.pdfclown.files.File("C:\\Users\\uc239646\\Desktop\\test.pdf");
List<Keyword> l=new ArrayList<Keyword>();
Keyword k=new Keyword();
Keyword k1=new Keyword();
k1.setKey("Just. ETS");
k1.setValue("NET");
l.add(k1);
Keyword k2=new Keyword();
k2.setKey("Test. ETS");
k2.setValue("PROFIT");
l.add(k2);
k.setKey("ETS");
k.setValue("LOSS");
l.add(k);
long startTime = System.currentTimeMillis();
// 2. Iterating through the document pages...
TextExtractor textExtractor = new TextExtractor(true, true);
for (final Page page : file.getDocument().getPages()) {
Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
for (Keyword e : l) {
Pattern pattern;
String serachKey = e.getKey();
final String translationKeyword = e.getValue();
if ((serachKey.contains(")") && serachKey.contains("("))
|| (serachKey.contains("(") && !serachKey.contains(")"))
|| (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
|| serachKey.contains("*") || serachKey.contains("+")) {
pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
}
else
pattern = Pattern.compile("\\b"+serachKey+"\\b", Pattern.CASE_INSENSITIVE);
// 2.1. Extract the page text!
//System.out.println(textStrings.toString().indexOf(entry.getKey()));
// 2.2. Find the text pattern matches!
final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings).toLowerCase());
// 2.3. Highlight the text pattern matches!
//System.out.println(textStrings);
textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
public boolean hasNext() {
// if(key.getMatchCriteria() == 1){
if (matcher.find()) {
return true;
}
/*
* } else if(key.getMatchCriteria() == 2) { if
*
*
*
*
*
*
*
*
* (matcher.hitEnd()) { count++; return true; } }
*/
return false;
}
public Interval<Integer> next() {
return new Interval<Integer>(matcher.start(), matcher.end());
}
public void process(Interval<Integer> interval, ITextString match) {
System.out.println(match);
// Defining the highlight box of the text pattern
// match...
/*List l=new ArrayList();
if(!l.contains(match)){
System.out.println("map.put("+match+","+translationKeyword+")");
}
*/
List<Quad> highlightQuads = new ArrayList<Quad>();
{
Rectangle2D textBox = null;
for (TextChar textChar : match.getTextChars()) {
Rectangle2D textCharBox = textChar.getBox();
if (textBox == null) {
textBox = (Rectangle2D) textCharBox.clone();
} else {
if (textCharBox.getY() > textBox.getMaxY()) {
highlightQuads.add(Quad.get(textBox));
textBox = (Rectangle2D) textCharBox.clone();
} else {
textBox.add(textCharBox);
}
}
System.out.println(highlightQuads.contains(textBox));
textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
highlightQuads.add(Quad.get(textBox));
}
/* List<Quad> highlightQuads = new ArrayList<Quad>();
List<TextChar> textChars = match.getTextChars();
Rectangle2D firstRect = textChars.get(0).getBox();
Rectangle2D lastRect = textChars.get(textChars.size()-1).getBox();
Rectangle2D rect = firstRect.createUnion(lastRect);
highlightQuads.add(Quad.get(rect));*/
// subtype can be Highlight, Underline, StrikeOut, Squiggly
new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);
}
}
public void remove() {
throw new UnsupportedOperationException();
}
});
}
}
SerializationModeEnum serializationMode = SerializationModeEnum.Standard;
file.save(new java.io.File(outputPath), serializationMode);
System.out.println("file created");
long endTime = System.currentTimeMillis();
System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);
} catch (Exception e) {
e.printStackTrace();
}
}
}