In the process of converting HTML to PDF using xmlworker, I faced two problems:
Problem 1: Header tags are not styled as expected. For instance, the font size and weight of the text inside an h1 tag are not affected by the enclosing tag. The same applies to the other header tags (h1-h6), although they are recognized and bookmarked in the PDF.
Problem 2: An image is not shown if it is wrapped inside a div tag. I am trying to set the alignment attribute of the parsed image. However, when I did that manually in my ImageProvider, the alignment was not reflected in the PDF document. When I created my own TagProcessor, the image was not shown when it was inside a div. When I change the parent tag from div to p (paragraph), the image shows perfectly and the alignment works fine, including text wrapping. Here is my code:
/**
 * Command-line entry point that renders {@code page02.html} into a
 * timestamped PDF file through an XML Worker pipeline
 * (CSS resolver -> HTML pipeline -> PDF writer).
 */
public class PDFCreator {

    /**
     * Runs the conversion and prints the stack trace of any failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            PDFCreator.generatePDF();
        } catch (Exception i1) {
            i1.printStackTrace();
        }
    }

    /**
     * Parses {@code page02.html} from the working directory and writes the
     * result to {@code V.<yyyyMMdd_HHmmss>.pdf}.
     *
     * <p>Both file streams are closed in {@code finally} blocks so that a
     * failure while building or parsing does not leak file handles.
     *
     * @throws DocumentException if the PDF document cannot be set up
     * @throws IOException       if the input cannot be read or the output cannot be written
     */
    private static void generatePDF() throws DocumentException,
            FileNotFoundException, BadElementException, MalformedURLException,
            IOException {
        String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(Calendar.getInstance().getTime());
        OutputStream output = new FileOutputStream("V." + timestamp + ".pdf");
        try {
            // step 1: A3 page with left/right/top/bottom margins
            Document document = new Document(PageSize.A3, 30, 30, 60, 100);
            // step 2: bind the document to the output file
            PdfWriter writer = PdfWriter.getInstance(document, output);
            writer.setTagged();
            document.open();

            // CSS: start from an empty stylesheet; inline style attributes are
            // resolved by StyleAttrCSSResolver.
            CSSResolver cssResolver = new StyleAttrCSSResolver();
            CssFile cssFile = XMLWorkerHelper.getCSS(new ByteArrayInputStream(new byte[0]));
            cssResolver.addCss(cssFile);

            // HTML: swap the stock <img> processor for the custom one and
            // register the provider that understands base64 data URIs.
            MyHtmlPipelineContext htmlContext = new MyHtmlPipelineContext();
            TagProcessorFactory factory = Tags.getHtmlTagProcessorFactory();
            factory.removeProcessor(HTML.Tag.IMG);
            factory.addProcessor(new ImageTagProcessor(), HTML.Tag.IMG);
            htmlContext.setTagFactory(factory);
            htmlContext.setImageProvider(new Base64ImageProvider());

            // Pipelines are chained from last to first: css -> html -> pdf
            PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
            HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
            CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);

            // XML Worker
            XMLWorker worker = new XMLWorker(css, true);
            XMLParser p = new XMLParser(worker);
            FileInputStream htmlInput = new FileInputStream("page02.html");
            try {
                p.parse(htmlInput);
            } finally {
                htmlInput.close();
            }

            // step 5: finish the document
            document.close();
        } finally {
            // Closing again after a successful document.close() is harmless;
            // on failure this is what releases the file handle.
            output.close();
        }
    }
}
/**
 * Pipeline context whose clones always carry a {@link Base64ImageProvider}.
 */
public class MyHtmlPipelineContext extends HtmlPipelineContext {

    /** Creates the context, passing {@code null} to the superclass constructor. */
    public MyHtmlPipelineContext() {
        super(null);
    }

    /**
     * Clones the context via the superclass and installs a fresh
     * {@link Base64ImageProvider} on the copy.
     *
     * @return the cloned context, never {@code null}
     * @throws IllegalStateException if the superclass clone fails with a
     *         checked exception; the original exception is kept as the cause
     */
    public HtmlPipelineContext clone() {
        try {
            HtmlPipelineContext ctx = super.clone();
            ctx.setImageProvider(new Base64ImageProvider());
            return ctx;
        } catch (RuntimeException e) {
            // Programming errors propagate unchanged.
            throw e;
        } catch (Exception e) {
            // Previously swallowed, which returned null and deferred the
            // failure to an unrelated NullPointerException in the caller.
            throw new IllegalStateException("Unable to clone HtmlPipelineContext", e);
        }
    }
}
/**
 * Tag processor for {@code <img>} that builds the image itself, so that the
 * CSS {@code float} value can be mapped to an image alignment with text wrap.
 */
public class ImageTagProcessor extends com.itextpdf.tool.xml.html.Image {

    /** Marker that precedes the payload in a base64 data URI. */
    private static final String BASE64_MARKER = "base64,";

    /**
     * Produces the content for a closing {@code <img>} tag.
     *
     * @param ctx            the worker context
     * @param tag            the {@code <img>} tag being closed
     * @param currentContent content collected inside the tag (unused)
     * @return a list holding the image, or an empty list when the image
     *         could not be created
     */
    public List<Element> end(final WorkerContext ctx, final Tag tag, final List<Element> currentContent) {
        List<Element> list = new ArrayList<Element>(1);
        Image image = getImageObject(ctx, tag);
        // Never hand a null element downstream; skip the image instead.
        if (image != null) {
            list.add(image);
        }
        return list;
    }

    /**
     * Builds an image from the tag's {@code src} attribute.
     *
     * <p>A {@code src} starting with {@code data} and containing
     * {@code base64,} is decoded in memory; any other value is passed to
     * {@code Image.getInstance(String)}. A CSS {@code float} of {@code left}
     * or {@code right} sets the matching alignment combined with text wrap.
     * When both the {@code width} and {@code height} attributes are positive
     * integers, the image is scaled to exactly that size.
     *
     * <p>NOTE(review): only the {@code width}/{@code height} <em>attributes</em>
     * are read; sizes given in the {@code style} attribute are not consulted here.
     *
     * @param ctx the worker context (unused)
     * @param tag the {@code <img>} tag
     * @return the image, or {@code null} when {@code src} is missing or the
     *         image cannot be loaded
     */
    public static Image getImageObject(WorkerContext ctx, Tag tag) {
        Map<String, String> tagAttributes = tag.getAttributes();
        Map<String, String> tagCss = tag.getCSS();

        String src = tagAttributes.get("src");
        if (src == null) {
            // Without a source there is nothing to load.
            return null;
        }

        try {
            Image imgObj;
            int pos = src.indexOf(BASE64_MARKER);
            if (src.startsWith("data") && pos > 0) {
                byte[] img = Base64.decode(src.substring(pos + BASE64_MARKER.length()));
                imgObj = Image.getInstance(img);
            } else {
                imgObj = Image.getInstance(src);
            }

            String floatValue = tagCss.get("float");
            if (floatValue != null) {
                if (floatValue.equalsIgnoreCase("right")) {
                    imgObj.setAlignment(Image.RIGHT | Image.TEXTWRAP);
                } else if (floatValue.equalsIgnoreCase("left")) {
                    imgObj.setAlignment(Image.LEFT | Image.TEXTWRAP);
                }
            }

            int width = parseDimension(tagAttributes.get("width"));
            int height = parseDimension(tagAttributes.get("height"));
            if (width > 0 && height > 0) {
                imgObj.scaleAbsolute((float) width, (float) height);
            }
            return imgObj;
        } catch (BadElementException ex) {
            return null;
        } catch (IOException ex) {
            return null;
        }
    }

    /**
     * Parses an integer dimension attribute, ignoring surrounding whitespace.
     *
     * @param value the raw attribute value, possibly {@code null}
     * @return the parsed value, or {@code 0} when absent, blank or not an integer
     */
    private static int parseDimension(String value) {
        if (value == null) {
            return 0;
        }
        String trimmed = value.trim();
        if (trimmed.length() == 0) {
            return 0;
        }
        try {
            return Integer.parseInt(trimmed);
        } catch (NumberFormatException ignored) {
            // A non-integer dimension simply means "do not scale".
            return 0;
        }
    }
}
/**
 * Image provider that accepts base64 data URIs in addition to ordinary
 * image locations.
 */
public class Base64ImageProvider extends AbstractImageProvider {

    /**
     * Loads the image referenced by {@code src} and stores it under that key.
     *
     * <p>A {@code src} that starts with {@code data} and contains
     * {@code base64,} is decoded in memory; anything else is passed to
     * {@code Image.getInstance(String)}.
     *
     * @param src the image source taken from the HTML
     * @return the loaded image, or {@code null} when it cannot be created
     */
    public Image retrieve(String src) {
        final int markerIndex = src.indexOf("base64,");
        final boolean inlineData = src.startsWith("data") && markerIndex > 0;
        try {
            Image loaded;
            if (inlineData) {
                byte[] decoded = Base64.decode(src.substring(markerIndex + 7));
                loaded = Image.getInstance(decoded);
            } else {
                loaded = Image.getInstance(src);
            }
            super.store(src, loaded);
            return loaded;
        } catch (BadElementException badElement) {
            return null;
        } catch (IOException ioFailure) {
            return null;
        }
    }

    /**
     * @return always {@code null}; this provider defines no root path
     */
    public String getImageRootPath() {
        return null;
    }
}
page02.html
<html>
<body ><h1>hello</h1>
<div style="font-size: medium;">
<img align="right"
src="path"
style="width: 267px; height: 200px; float: left;" /></div>
</body>
</html>
Is there any configuration that I missed? Why is the image shown correctly if it is inside a paragraph tag but not inside a div tag? Where do I have to modify the code in order to get this working? Note: the image is shown correctly, but with no alignment or text wrapping, if I use the default TagProcessor.