4

I am trying to get sizes (width and depth) of images embedded in a PDF file. The images in the PDF are all high resolution vector images.

  • I tried using PDFBox. It extracts ordinary raster images perfectly, but when it encounters a vector image it extracts the different layers as separate images.
  • I have also read about iText, but iText converts the whole page into a single rasterized image, whereas my PDF page actually consists of multiple images and I need to extract (or get the size of) each of them separately.

I am attaching my PDFBox image extraction code here. Please let me know how I can get each vector image as a single image rather than as separate layers.

My code is as follows:

package com.abp.pdf.util;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;

/**
 * Command-line tool that walks every page of a PDF, prints the pixel width
 * and height of each image XObject it finds, and writes each image to disk.
 *
 * <p>Only image XObjects (and images nested inside form XObjects) are
 * visited. Vector artwork drawn with path operators in the page content
 * stream is not an XObject and is therefore not reported by this class.
 */
public class ExtractImages {

    private static final String PASSWORD = "-password";
    private static final String PREFIX = "-prefix";
    private static final String ADDKEY = "-addkey";
    private static final String NONSEQ = "-nonSeq";

    /** PDF processed when no file is given on the command line. */
    private static final String DEFAULT_PDF_FILE = "/home/suvankar/Resources/myfile.pdf";

    /** Sub-directory (relative to the PDF's directory) receiving the images by default. */
    private static final String DEFAULT_OUTPUT_DIR = "extracted/images";

    private static final String PDF_EXTENSION = ".pdf";

    /** Next numeric suffix to try when building an output file name. */
    private int imageCounter = 1;

    private ExtractImages() {
    }

    /**
     * Entry point.
     *
     * @param args command-line arguments; see {@link #usage()}
     * @throws Exception if the document cannot be opened, decrypted or read
     */
    public static void main(String[] args) throws Exception {
        ExtractImages extractor = new ExtractImages();
        extractor.extractImages(args);
    }

    /**
     * Parses the command line, opens the document and extracts the images of
     * every page.
     *
     * <p>When no PDF file is named on the command line the built-in default
     * path is used, so running without arguments still processes that file.
     *
     * @param args command-line arguments; may be empty
     * @throws Exception if the document cannot be opened, decrypted or read,
     *                   or if the document does not permit content extraction
     */
    private void extractImages(String[] args) throws Exception {
        String pdfFile = null;
        String password = "";
        String prefix = null;
        boolean addKey = false;
        // The non-sequential parser is the default here; -nonSeq is still
        // accepted so documented command lines keep working.
        boolean useNonSeqParser = true;

        if (args != null) {
            for (int i = 0; i < args.length; i++) {
                String arg = args[i];
                if (PASSWORD.equals(arg)) {
                    i++;
                    if (i >= args.length) {
                        usage();
                    }
                    password = args[i];
                } else if (PREFIX.equals(arg)) {
                    i++;
                    if (i >= args.length) {
                        usage();
                    }
                    prefix = args[i];
                } else if (ADDKEY.equals(arg)) {
                    addKey = true;
                } else if (NONSEQ.equals(arg)) {
                    useNonSeqParser = true;
                } else if (pdfFile == null) {
                    pdfFile = arg;
                }
            }
        }

        if (pdfFile == null) {
            pdfFile = DEFAULT_PDF_FILE;
        }
        if (prefix == null) {
            prefix = defaultPrefix(pdfFile);
        }

        PDDocument document = null;
        try {
            if (useNonSeqParser) {
                document = PDDocument.loadNonSeq(new File(pdfFile), null,
                        password);
            } else {
                document = PDDocument.load(pdfFile);

                if (document.isEncrypted()) {
                    StandardDecryptionMaterial spm = new StandardDecryptionMaterial(
                            password);
                    document.openProtection(spm);
                }
            }
            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent()) {
                throw new IOException(
                        "Error: You do not have permission to extract images.");
            }

            // Images are written next to the prefix; make sure that
            // directory exists, otherwise the first write fails.
            ensureParentDirectory(prefix);

            List<?> pages = document.getDocumentCatalog().getAllPages();
            for (Object element : pages) {
                PDPage page = (PDPage) element;
                processResources(page.getResources(), prefix, addKey);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Builds the default output prefix
     * {@code <pdf directory>/extracted/images/<pdf name without .pdf>}.
     *
     * @param pdfFile path of the PDF being processed
     * @return path prefix (without counter or extension) for the image files
     */
    private static String defaultPrefix(String pdfFile) {
        File pdf = new File(pdfFile);
        String baseName = pdf.getName();
        int stemLength = baseName.length() - PDF_EXTENSION.length();
        // Only strip the extension when it really is ".pdf" (any case).
        if (stemLength > 0
                && baseName.regionMatches(true, stemLength, PDF_EXTENSION, 0,
                        PDF_EXTENSION.length())) {
            baseName = baseName.substring(0, stemLength);
        }
        // A null parent (bare file name) resolves relative to the working directory.
        File outputDir = new File(pdf.getParentFile(), DEFAULT_OUTPUT_DIR);
        return new File(outputDir, baseName).getPath();
    }

    /**
     * Creates the directory that will contain files named after
     * {@code prefix} if it does not exist yet.
     *
     * @param prefix path prefix of the image files
     * @throws IOException if the directory is missing and cannot be created
     */
    private static void ensureParentDirectory(String prefix) throws IOException {
        File outputDir = new File(prefix).getAbsoluteFile().getParentFile();
        if (outputDir != null && !outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IOException("Error: Could not create output directory "
                    + outputDir);
        }
    }

    /**
     * Writes every image XObject in {@code resources} to disk and prints its
     * height and width, descending into form XObjects to find nested images.
     *
     * @param resources resource dictionary to scan; {@code null} is ignored
     * @param prefix    path prefix for the output files
     * @param addKey    whether to include the XObject's resource key in the file name
     * @throws IOException if an image cannot be written
     */
    private void processResources(PDResources resources, String prefix,
            boolean addKey) throws IOException {
        if (resources == null) {
            return;
        }
        Map<String, PDXObject> xobjects = resources.getXObjects();
        if (xobjects == null) {
            return;
        }
        for (Map.Entry<String, PDXObject> entry : xobjects.entrySet()) {
            String key = entry.getKey();
            PDXObject xobject = entry.getValue();
            if (xobject instanceof PDXObjectImage) {
                PDXObjectImage image = (PDXObjectImage) xobject;
                String namePrefix = addKey ? prefix + "_" + key : prefix;
                String name = getUniqueFileName(namePrefix, image.getSuffix());
                System.out.println("Writing image:" + name + "\nHeight - "+ image.getHeight() + "\nWidth - " + image.getWidth());
                image.write2file(name);
            } else if (xobject instanceof PDXObjectForm) {
                // A form XObject carries its own resources, which may hold more images.
                PDXObjectForm xObjectForm = (PDXObjectForm) xobject;
                processResources(xObjectForm.getResources(), prefix, addKey);
            }
        }
    }

    /**
     * Returns {@code prefix-N} for the first counter value {@code N} such
     * that {@code prefix-N.suffix} does not exist yet. The counter is shared
     * across calls and only ever increases.
     *
     * @param prefix path prefix of the file
     * @param suffix file extension without the leading dot
     * @return the unique file name without extension
     */
    private String getUniqueFileName(String prefix, String suffix) {
        String uniqueName = null;
        File f = null;
        while (f == null || f.exists()) {
            uniqueName = prefix + "-" + imageCounter;
            f = new File(uniqueName + "." + suffix);
            imageCounter++;
        }
        return uniqueName;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage() {
        System.err
                .println("Usage: java org.apache.pdfbox.ExtractImages [OPTIONS] <PDF file>\n"
                        + "  -password  <password>        Password to decrypt document\n"
                        + "  -prefix  <image-prefix>      Image prefix(default to pdf name)\n"
                        + "  -addkey                      add the internal image key to the file name\n"
                        + "  -nonSeq                      Enables the new non-sequential parser\n"
                        + "  <PDF file>                   The PDF document to use\n");
        System.exit(1);
    }

}
James A Mohler
  • 11,060
  • 15
  • 46
  • 72
cybersuv
  • 41
  • 3
  • The problem when trying to extract vector graphic images from a PDF is that there is no clear way to recognize where different vector graphic images on the same page start and end, as the path operators on a page are not necessarily sorted by image and there is no marker indicating the start or end of an image. Furthermore, you cannot tell whether some text is part of such an image or part of the general page text. – mkl Feb 13 '13 at 10:14

0 Answers0