I have a PDF file with colored text, and I need to remove the color (i.e. make the text black). I couldn't find much help anywhere, so I dug in and figured it out with the help of this post: "PDFBox 2.0 RC3 -- Find and replace text".
As there isn't much about this, I suspect that few people care; still, I thought I'd share.
/**
 * Rewrites the content stream of every page so that RGB fill colors set
 * inside text objects become black.
 *
 * <p>For each page the content stream is tokenized, and every {@code rg}
 * operator that appears after a {@code BT} operator and before the matching
 * {@code ET} has its preceding numeric operands replaced with {@code 0}.
 * The modified tokens are then written to a new Flate-compressed stream that
 * replaces the page's contents.
 *
 * <p>Only the {@code rg} operator inside {@code BT}/{@code ET} is rewritten;
 * any other color-setting operator, and any {@code rg} that occurs outside a
 * text object, is left untouched.
 *
 * @param pdDocument the document whose pages are modified in place
 * @throws IOException if a page's content stream cannot be parsed or the
 *                     replacement stream cannot be written
 */
private void setTextBlack(PDDocument pdDocument) throws IOException {
    for (PDPage pdPage : pdDocument.getPages()) {
        PDFStreamParser parser = new PDFStreamParser(pdPage);
        parser.parse();
        java.util.List<Object> tokens = parser.getTokens();

        // Single pass: track whether we are between BT and ET instead of
        // rescanning each text object with a nested loop.
        boolean insideTextObject = false;
        for (int i = 0; i < tokens.size(); i++) {
            Object token = tokens.get(i);
            if (isOperator(token, "BT")) {
                insideTextObject = true;
            } else if (isOperator(token, "ET")) {
                insideTextObject = false;
            } else if (insideTextObject && isOperator(token, "rg")) {
                // The operands of an operator precede it in the token list;
                // walk backwards over the numeric ones and zero them out.
                int n = i - 1;
                while (n >= 0
                        && (tokens.get(n) instanceof COSInteger
                            || tokens.get(n) instanceof COSFloat)) {
                    tokens.set(n, new COSFloat(0f));
                    n--;
                }
            }
        }

        PDStream updatedStream = new PDStream(pdDocument);
        // try-with-resources guarantees the stream is closed even when
        // writing the tokens fails, and that it is fully flushed before the
        // new stream is attached to the page.
        try (OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE)) {
            ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
            tokenWriter.writeTokens(tokens);
        }
        pdPage.setContents(updatedStream);
    }
}

/** Returns true when {@code token} is a content-stream operator with the given name. */
private static boolean isOperator(Object token, String name) {
    return token instanceof Operator && name.equals(((Operator) token).getName());
}