I have a requirement where in I have to parse a DOCX file and extract all the text and images. I am using OpenxmlSDK 2.5 to achieve this. I am able to parse the images and text but the DOCX also have a group of Shapes which I trying to parse and convert them to Drawing images, which is giving me wrong results.
Here is the Sample docx file which I am trying to parse.
I referred the this Stack overflow discussion and tried the same way but no luck.
The resulting DOCX which I am creating with following code is not having any parsed images.
using System.Collections.Generic;
using System.Linq;
using System.IO;
using System.Drawing;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using DocumentFormat.OpenXml.Vml;
using DocumentFormat.OpenXml;
namespace ReadGroupShape
{
class Program
{
static List<Bitmap> images = new List<Bitmap>();
static void Main(string[] args)
{
MainDocumentPart mainPart = null;
Body content = null;
WordprocessingDocument newDoc = WordprocessingDocument.Create("NewDocx.docx", WordprocessingDocumentType.Document);
MainDocumentPart newMainPart = newDoc.AddMainDocumentPart();
newMainPart.Document = new Document();
Body newbody = newMainPart.Document.AppendChild(new Body());
byte[] docBytes = File.ReadAllBytes("SampleDoc.docx");
using (MemoryStream ms = new MemoryStream())
{
ms.Write(docBytes, 0, docBytes.Length);
using (WordprocessingDocument wpDoc = WordprocessingDocument.Open(ms, true))
{
mainPart = wpDoc.MainDocumentPart;
content = mainPart.Document.Body;
foreach (Paragraph par in content.Descendants<Paragraph>())
{
Paragraph npar = newbody.AppendChild(new Paragraph());
foreach (Run run in par.Descendants<Run>())
{
Run nrun = npar.AppendChild(new Run());
DocumentFormat.OpenXml.Drawing.Blip pic = run.Descendants<DocumentFormat.OpenXml.Drawing.Blip>().FirstOrDefault();
ImageData imageData = run.Descendants<ImageData>().FirstOrDefault();
if (pic == null && imageData == null)
{
nrun.InsertAfterSelf(run.CloneNode(true));
}
else
{
if (pic != null)
{
nrun.InsertAfterSelf(CreateImageFromBlip(wpDoc, run, newMainPart, pic));
}
else if (imageData != null)
{
nrun.InsertAfterSelf(CreateImageFromShape(wpDoc, run, newMainPart, imageData));
}
}
}
}
mainPart.Document.Save();
}
}
newMainPart.Document.Save();
newDoc.Close();
}
private static Run CreateImageFromShape(WordprocessingDocument sourceDoc, Run sourceRun, MainDocumentPart mainpart, ImageData imageData)
{
ImagePart p = sourceDoc.MainDocumentPart.GetPartById(imageData.RelationshipId) as ImagePart;
return CreateImageRun(sourceDoc, sourceRun, mainpart, p);
}
private static Run CreateImageFromBlip(WordprocessingDocument sourceDoc, Run sourceRun, MainDocumentPart mainpart, DocumentFormat.OpenXml.Drawing.Blip blip)
{
ImagePart newPart = mainpart.AddImagePart(ImagePartType.Png);
ImagePart p = sourceDoc.MainDocumentPart.GetPartById(blip.Embed.Value) as ImagePart;
Bitmap image = new Bitmap(p.GetStream());
using (Stream s = p.GetStream())
{
s.Position = 0;
newPart.FeedData(s);
}
string partId = mainpart.GetIdOfPart(newPart);
Drawing newImage = CreateImage(partId);
return new Run(newImage);
}
private static Run CreateImageRun(WordprocessingDocument sourceDoc, Run sourceRun, MainDocumentPart mainpart, ImagePart p)
{
ImagePart newPart = mainpart.AddImagePart(ImagePartType.Png);
using (Stream s = p.GetStream())
{
s.Position = 0;
newPart.FeedData(s);
}
string partId = mainpart.GetIdOfPart(newPart);
Drawing newImage = CreateImage(partId);
return new Run(newImage);
}
private static Drawing CreateImage(string relationshipId)
{
// Define the reference of the image.
return new Drawing(
new DocumentFormat.OpenXml.Drawing.Wordprocessing.Inline(
new DocumentFormat.OpenXml.Drawing.Wordprocessing.Extent() { Cx = 990000L, Cy = 792000L },
new DocumentFormat.OpenXml.Drawing.Wordprocessing.EffectExtent()
{
LeftEdge = 0L,
TopEdge = 0L,
RightEdge = 0L,
BottomEdge = 0L
},
new DocumentFormat.OpenXml.Drawing.Wordprocessing.DocProperties()
{
Id = (UInt32Value)1U,
Name = "Picture 1"
},
new DocumentFormat.OpenXml.Drawing.Wordprocessing.NonVisualGraphicFrameDrawingProperties(
new DocumentFormat.OpenXml.Drawing.GraphicFrameLocks() { NoChangeAspect = true }),
new DocumentFormat.OpenXml.Drawing.Graphic(
new DocumentFormat.OpenXml.Drawing.GraphicData(
new DocumentFormat.OpenXml.Drawing.Picture(
new DocumentFormat.OpenXml.Drawing.NonVisualPictureProperties(
new DocumentFormat.OpenXml.Drawing.NonVisualDrawingProperties()
{
Id = (UInt32Value)0U,
Name = "New Bitmap Image.jpg"
},
new DocumentFormat.OpenXml.Drawing.NonVisualPictureDrawingProperties()),
new DocumentFormat.OpenXml.Drawing.BlipFill(
new DocumentFormat.OpenXml.Drawing.Blip(
new DocumentFormat.OpenXml.Drawing.BlipExtensionList(
new DocumentFormat.OpenXml.Drawing.BlipExtension()
{
Uri =
"{28A0092B-C50C-407E-A947-70E740481C1C}"
})
)
{
Embed = relationshipId,
CompressionState =
DocumentFormat.OpenXml.Drawing.BlipCompressionValues.Print
},
new DocumentFormat.OpenXml.Drawing.Stretch(
new DocumentFormat.OpenXml.Drawing.FillRectangle())),
new DocumentFormat.OpenXml.Drawing.ShapeProperties(
new DocumentFormat.OpenXml.Drawing.Transform2D(
new DocumentFormat.OpenXml.Drawing.Offset() { X = 0L, Y = 0L },
new DocumentFormat.OpenXml.Drawing.Extents() { Cx = 990000L, Cy = 792000L }),
new DocumentFormat.OpenXml.Drawing.PresetGeometry(
new DocumentFormat.OpenXml.Drawing.AdjustValueList()
)
{ Preset = DocumentFormat.OpenXml.Drawing.ShapeTypeValues.Rectangle }))
)
{ Uri = "http://schemas.openxmlformats.org/drawingml/2006/picture" })
)
{
DistanceFromTop = (UInt32Value)0U,
DistanceFromBottom = (UInt32Value)0U,
DistanceFromLeft = (UInt32Value)0U,
DistanceFromRight = (UInt32Value)0U,
EditId = "50D07946"
});
}
}
}
What is that I am missing? Could anyone please help me to parse the shapes and images?
Thanks.