I am trying to use the xpdf source code in an MFC application to convert PDF to text. The code sample is taken from their site (or repository):
int Pdf2Txt(std::string PdfFile, std::string TxtFile) const
{
GString* ownerPW, *userPW;
UnicodeMap* uMap;
TextOutputDev* textOut;
TextOutputControl textOutControl;
GString* textFileName;
int exitCode;
char textEncName[128] = "";
char textEOL[16] = "";
GBool noPageBreaks = gFalse;
GBool quiet = gFalse;
char ownerPassword[33] = "\001";
char userPassword[33] = "\001";
int firstPage = 1;
int lastPage = 0;
GBool tableLayout = gFalse;
double fixedPitch = 0;
GBool physLayout = gFalse;
GBool simpleLayout = gFalse;
GBool simple2Layout = gFalse;
GBool linePrinter = gFalse;
GBool rawOrder = gFalse;
double fixedLineSpacing = 0;
double marginLeft = 0;
double marginRight = 0;
double marginTop = 0;
double marginBottom = 0;
GBool clipText = gFalse;
GBool discardDiag = gFalse;
GBool insertBOM = gFalse;
exitCode = 99;
// read config file
globalParams = new GlobalParams("");
if (textEncName[0])
{
globalParams->setTextEncoding(textEncName);
}
if (textEOL[0])
{
if (!globalParams->setTextEOL(textEOL))
{
fprintf(stderr, "Bad '-eol' value on command line\n");
}
}
if (noPageBreaks)
{
globalParams->setTextPageBreaks(gFalse);
}
if (quiet)
{
globalParams->setErrQuiet(quiet);
}
// Set UNICODE support
globalParams->setTextEncoding("UTF-8");
// get mapping to output encoding
if (!(uMap = globalParams->getTextEncoding()))
{
error(errConfig, -1, "Couldn't get text encoding");
goto err1;
}
// open PDF file
if (ownerPassword[0] != '\001')
{
ownerPW = new GString(ownerPassword);
}
else
{
ownerPW = NULL;
}
if (userPassword[0] != '\001')
{
userPW = new GString(userPassword);
}
else
{
userPW = NULL;
}
PDFDoc* doc = new PDFDoc((char*)PdfFile.c_str(), ownerPW, userPW);
if (userPW)
{
delete userPW;
}
if (ownerPW)
{
delete ownerPW;
}
if (! doc->isOk())
{
exitCode = 1;
goto err2;
}
// check for copy permission
if (! doc->okToCopy())
{
error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
exitCode = 3;
goto err2;
}
// construct text file name
textFileName = new GString(TxtFile.c_str());
// get page range
if (firstPage < 1)
{
firstPage = 1;
}
if (lastPage < 1 || lastPage > doc->getNumPages())
{
lastPage = doc->getNumPages();
}
// write text file
if (tableLayout)
{
textOutControl.mode = textOutTableLayout;
textOutControl.fixedPitch = fixedPitch;
}
else if (physLayout)
{
textOutControl.mode = textOutPhysLayout;
textOutControl.fixedPitch = fixedPitch;
}
else if (simpleLayout)
{
textOutControl.mode = textOutSimpleLayout;
}
else if (simple2Layout)
{
textOutControl.mode = textOutSimple2Layout;
}
else if (linePrinter)
{
textOutControl.mode = textOutLinePrinter;
textOutControl.fixedPitch = fixedPitch;
textOutControl.fixedLineSpacing = fixedLineSpacing;
}
else if (rawOrder)
{
textOutControl.mode = textOutRawOrder;
}
else
{
textOutControl.mode = textOutReadingOrder;
}
textOutControl.clipText = clipText;
textOutControl.discardDiagonalText = discardDiag;
textOutControl.insertBOM = insertBOM;
textOutControl.marginLeft = marginLeft;
textOutControl.marginRight = marginRight;
textOutControl.marginTop = marginTop;
textOutControl.marginBottom = marginBottom;
textOut = new TextOutputDev(textFileName->getCString(), &textOutControl, gFalse, gTrue);
if (textOut->isOk())
{
doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gFalse, gTrue, gFalse);
}
else
{
delete textOut;
exitCode = 2;
goto err3;
}
delete textOut;
exitCode = 0;
// clean up
err3:
delete textFileName;
err2:
delete doc;
// uMap->decRefCnt();
err1:
delete globalParams;
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
return exitCode;
}
So far, so good. But this code isn't thread-safe: if I try to run it from multi-threaded code, it crashes:
// TextOutputDev.cc
if (uMap->isUnicode())
{
lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); // <-- crash
Why? Because there is a variable, globalParams, which is deleted in the last lines of the function and is shared by all threads:
delete globalParams;
And globalParams is an extern global variable declared in GlobalParams.h (part of the xpdf code):
// xpdf/GlobalParams.h
// The global parameters object.
extern GlobalParams *globalParams;
How can I make this function thread-safe? The "problem variable" is inside the xpdf source code, not in mine...
P.S. To sum up, globalParams is declared in the xpdf code, and it is cleaned up in my (client) code.
The xpdf source code can be seen here: https://github.com/jeroen/xpdf/blob/c2c946f517eb09cfd09d957e0f3b04d44bf6f827/src/poppler/GlobalParams.h
and