Hello, I'm trying to extract data from a scanned document or a PDF file that contains images with some data in them. For ordinary PDF files I have succeeded in reading the data, but for images, or for PDFs that contain pictures, I have failed. I converted the image file to a PDF, but I still get no result. This is my small piece of code:
/// <summary>
/// Wraps a single image file in a one-page PDF whose page size is taken from the image.
/// </summary>
/// <remarks>
/// Only the picture itself is written to the page; this method adds no text content.
/// NOTE(review): a PDF produced this way presumably has nothing for a text extractor
/// to find unless the picture is run through OCR first - confirm against the extractor used.
/// </remarks>
/// <param name="ImageIn">Path of the image file to read.</param>
/// <param name="PDFOut">Path of the PDF file to create (an existing file is overwritten).</param>
/// <returns><c>true</c> when the PDF was written; <c>false</c> when any exception occurred.</returns>
public bool ConvertImageToPdf(string ImageIn, string PDFOut)
{
    try
    {
        byte[] imageBytes = File.ReadAllBytes(ImageIn);
        iTextSharp.text.Image image = iTextSharp.text.Image.GetInstance(imageBytes);

        // Anchor the picture at the page origin so it covers the image-sized page.
        image.SetAbsolutePosition(0, 0);

        using (FileStream fs = new FileStream(PDFOut, FileMode.Create, FileAccess.Write, FileShare.None))
        {
            // The image doubles as the page rectangle, so the page matches the picture.
            using (Document doc = new Document(image))
            {
                // The writer must be attached before the document is opened.
                using (PdfWriter writer = PdfWriter.GetInstance(doc, fs))
                {
                    doc.Open();

                    // Add the image exactly once; also drawing it through
                    // writer.DirectContent would emit the same picture a second time.
                    doc.Add(image);

                    doc.Close();
                    return true;
                }
            }
        }
    }
    catch (Exception)
    {
        // Best-effort contract kept for existing callers: failures are reported as false.
        return false;
    }
}
And for extracting the data, I use this code:
// Walk the pages in order (page numbers start at 1) and write each page's
// extracted text to outFile, followed by a single space.
int pageNumber = 1;
while (pageNumber <= reader.NumberOfPages)
{
    var pageContent = reader.GetPageContent(pageNumber);
    var pageText = ExtractTextFromPDFBytes(pageContent);
    outFile.Write(pageText + " ");
    pageNumber++;
}
The ExtractTextFromPDFBytes(byte[] input) method is similar to the one in the CodeProject sample.