I've followed this answer and wrote program,In pdf I'm trying to replace word datalog with ddddddd.All the occurrence got successfully replaced.But the problem is that at some places where "- " is present got replaced by illegal character Å’.The word datalog is in page no 3,5.But the i got this illegal character at page no 4.I want to know why did i get that character.any help would be highly appreciated.
import java.io.*;
import java.util.*;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
public class SimpleReplace {
public static void main (String[] args) throws Exception {
PDDocument document = null;
String fileName ="";
try {
document = PDDocument.load( new File(fileName),"" );
document.setAllSecurityToBeRemoved(true);
String outputFileName = "SimpleReplace.pdf";
// the encoding will need to be adapted to your circumstances
//String encoding = "ISO-8859-1";
String encoding = "ISO-8859-1";
// Note that search and replace can be regular expressions
// replace all occurrences of 'Hello'
searchReplace(" ", "Aaaa Aaaaa Aaa", encoding,true, document);
// replace only first occurrence of 'World'
// Save the results and ensure that the document is properly closed
document.save(outputFileName);
}
finally {
if( document != null ) {
document.close();
}
}
}
private static void searchReplace (String search, String replace,String encoding ,boolean replaceAll, PDDocument doc) throws IOException {
PDPageTree pages = doc.getDocumentCatalog().getPages();
for (PDPage page : pages) {
int count=0;
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List tokens = parser.getTokens();
for (int j = 0; j < tokens.size(); j++) {
Object next = tokens.get(j);
if (next instanceof Operator) {
Operator op = (Operator) next;
// Tj and TJ are the two operators that display strings in a PDF
// Tj takes one operator and that is the string to display so lets update that operator
if (op.getName().equals("Tj")) {
COSString previous = (COSString) tokens.get(j-1);
String string = previous.getString();
if (replaceAll) {
string = string.replaceAll(search, replace);
}
else
string = string.replaceFirst(search, replace);
previous.setValue(string.getBytes());
} else if (op.getName().equals("TJ")) {
COSArray previous = (COSArray) tokens.get(j-1);
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
String string = cosString.getString();
if (replaceAll)
string = string.replaceAll(search, replace);
else
string = string.replaceFirst(search, replace);
cosString.setValue(string.getBytes());
}
}
}
}
}
// now that the tokens are updated we will replace the page content stream.
PDStream updatedStream = new PDStream(doc);
OutputStream out = updatedStream.createOutputStream();
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
out.close();
page.setContents(updatedStream);
}
}
}