0

I have tried to use xpdf source code into a MFC application to convert pdf to text. The code sample is taken from their site (or repository):

int Pdf2Txt(std::string PdfFile, std::string TxtFile) const
{
    GString* ownerPW, *userPW;
    UnicodeMap* uMap;
    TextOutputDev* textOut;
    TextOutputControl textOutControl;
    GString* textFileName;

    int exitCode;
    char textEncName[128] = "";
    char textEOL[16] = "";
    GBool noPageBreaks = gFalse;
    GBool quiet = gFalse;
    char ownerPassword[33] = "\001";
    char userPassword[33] = "\001";
    int firstPage = 1;
    int lastPage = 0;
    GBool tableLayout = gFalse;
    double fixedPitch = 0;
    GBool physLayout = gFalse;
    GBool simpleLayout = gFalse;
    GBool simple2Layout = gFalse;
    GBool linePrinter = gFalse;
    GBool rawOrder = gFalse;
    double fixedLineSpacing = 0;
    double marginLeft = 0;
    double marginRight = 0;
    double marginTop = 0;
    double marginBottom = 0;
    GBool clipText = gFalse;
    GBool discardDiag = gFalse;
    GBool insertBOM = gFalse;

    exitCode = 99;

    // read config file
    globalParams = new GlobalParams("");
    if (textEncName[0])
    {
        globalParams->setTextEncoding(textEncName);
    }
    if (textEOL[0])
    {
        if (!globalParams->setTextEOL(textEOL))
        {
            fprintf(stderr, "Bad '-eol' value on command line\n");
        }
    }
    if (noPageBreaks)
    {
        globalParams->setTextPageBreaks(gFalse);
    }
    if (quiet)
    {
        globalParams->setErrQuiet(quiet);
    }

    // Set UNICODE support
    globalParams->setTextEncoding("UTF-8");

    // get mapping to output encoding
    if (!(uMap = globalParams->getTextEncoding()))
    {
        error(errConfig, -1, "Couldn't get text encoding");
        goto err1;
    }

    // open PDF file
    if (ownerPassword[0] != '\001')
    {
        ownerPW = new GString(ownerPassword);
    }
    else
    {
        ownerPW = NULL;
    }
    if (userPassword[0] != '\001')
    {
        userPW = new GString(userPassword);
    }
    else
    {
        userPW = NULL;
    }
    PDFDoc* doc = new PDFDoc((char*)PdfFile.c_str(), ownerPW, userPW);
    if (userPW)
    {
        delete userPW;
    }
    if (ownerPW)
    {
        delete ownerPW;
    }
    if (! doc->isOk())
    {
        exitCode = 1;
        goto err2;
    }

    // check for copy permission
    if (! doc->okToCopy())
    {
        error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
        exitCode = 3;
        goto err2;
    }

    // construct text file name
    textFileName = new GString(TxtFile.c_str());

    // get page range
    if (firstPage < 1)
    {
        firstPage = 1;
    }
    if (lastPage < 1 || lastPage > doc->getNumPages())
    {
        lastPage = doc->getNumPages();
    }

    // write text file
    if (tableLayout)
    {
        textOutControl.mode = textOutTableLayout;
        textOutControl.fixedPitch = fixedPitch;
    }
    else if (physLayout)
    {
        textOutControl.mode = textOutPhysLayout;
        textOutControl.fixedPitch = fixedPitch;
    }
    else if (simpleLayout)
    {
        textOutControl.mode = textOutSimpleLayout;
    }
    else if (simple2Layout)
    {
        textOutControl.mode = textOutSimple2Layout;
    }
    else if (linePrinter)
    {
        textOutControl.mode = textOutLinePrinter;
        textOutControl.fixedPitch = fixedPitch;
        textOutControl.fixedLineSpacing = fixedLineSpacing;
    }
    else if (rawOrder)
    {
        textOutControl.mode = textOutRawOrder;
    }
    else
    {
        textOutControl.mode = textOutReadingOrder;
    }
    textOutControl.clipText = clipText;
    textOutControl.discardDiagonalText = discardDiag;
    textOutControl.insertBOM = insertBOM;
    textOutControl.marginLeft = marginLeft;
    textOutControl.marginRight = marginRight;
    textOutControl.marginTop = marginTop;
    textOutControl.marginBottom = marginBottom;
    textOut = new TextOutputDev(textFileName->getCString(), &textOutControl, gFalse, gTrue);
    if (textOut->isOk())
    {
        doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gFalse, gTrue, gFalse);
    }
    else
    {
        delete textOut;
        exitCode = 2;
        goto err3;
    }
    delete textOut;

    exitCode = 0;

    // clean up
err3:
    delete textFileName;
err2:
    delete doc;
//  uMap->decRefCnt();
err1:
    delete globalParams;

    // check for memory leaks
    Object::memCheck(stderr);
    gMemReport(stderr);

    return exitCode;
}

So far, so good. But this code isn't thread safe: if I am trying run this code inside a multi-threading code, it crashes:

// TextOutputDev.cc
if (uMap->isUnicode())
{

    lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); // <-- crash

Why ? Because there is a variable, globalParams, which is deleted in the last lines of the function, and it's common for all threads:

delete globalParams;

And globalParams it's an extern global variable from GlobalParams.h (part of xpdf code):

// xpdf/GlobalParams.h
// The global parameters object.
extern GlobalParams *globalParams;

How can I do this function thread safe ? Because the "problem variable" it's inside xpdf source code, not in mine ...

P.S. To sum up things, globalParams it's declared in xpdf code, and it's cleaned in my (client) code.

The xpdf source code could be seen here: https://github.com/jeroen/xpdf/blob/c2c946f517eb09cfd09d957e0f3b04d44bf6f827/src/poppler/GlobalParams.h

and

https://github.com/jeroen/xpdf/blob/c2c946f517eb09cfd09d957e0f3b04d44bf6f827/src/poppler/GlobalParams.cc

Flaviu_
  • 1,285
  • 17
  • 33
  • Do you know what `std::mutex` is and how it works? – Sam Varshavchik Sep 19 '22 at 11:23
  • Yes, I know what `mutex` is, and do you think this will solve the problem ? See my last comment: `globalParams` it's declared in xpdf code, and it's cleaned in my (client) code. So, what is the point with `std::mutex` ? – Flaviu_ Sep 19 '22 at 11:47
  • Yes, proper use of mutex should work. That's it's entire reason for existence: block execution threads from simultaneously executing non-thread safe code or data. That sounds exactly what you need? – Sam Varshavchik Sep 19 '22 at 11:56
  • Hot take: `goto` has been deprecated in favor of [RAII](https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization). This is [tag:c++], right? – IInspectable Sep 19 '22 at 12:21
  • @IInspectable yes, I know, it's just a legacy code. The issue remain with or without `goto` statement – Flaviu_ Sep 19 '22 at 12:30
  • @SamVarshavchik the point is to have a concurential execution. – Flaviu_ Sep 19 '22 at 12:31
  • 2
    Better check the code of the `GlobalParams` class. Unless its members are static, or there's some tricky code to make it behaving like a singleton, this code shouldn't be crashing, you create it, use it and delete it. You could modify its code so that it's never deleted (leaving one instance in memory throughout the app's lifecycle won't make it resource-hungry). You may even consider defining a new class. But finally, if it's really a singleton, why ever delete it? Access, like the `setTextEncoding()` call could be serialized or called once, eg during initialization. – Constantine Georgiou Sep 19 '22 at 13:16
  • 1
    If you want your code to be able of performing multiple parallel conversions using different encoding, EOL, page-break etc etc parameters, the Params would need to be instantiated per thread, so they can't be Global, you will need a derived class in this case. – Constantine Georgiou Sep 19 '22 at 13:16
  • 3
    Then more work has to be done. A lot of work. There are very few make-it-happen buttons in C++ hiding somewhere, that merely need to be discovered and pushed in order to make everything work to one's desires. This isn't one of them. If xpdf's internals are not thread safe, then they are not thread safe. That's it. The only quick solution here is to protect everything via a mutex, or use multiple processes instead of multiple execution threads. – Sam Varshavchik Sep 19 '22 at 14:00

1 Answers1

1

Try restructuring your code as shown below. I have moved the GlobalParams initialization code into a separate function. This function should be called (once) during initialization, or before starting the threads that call Pdf2Txt(). And of course the GlobalParams instance shouldn't be destroyed because it can be used by multiple threads. It won't hurt your app to keep it memory, it's one object anyway and not really large - well, it contains many int and bool member variables, but these do not take up much space, and quite a few string* variables (initially null or emtpy I guess), so it's just a few KB at most.

bool InitGlobalParams()
{
    UnicodeMap* uMap;
    char textEncName[128] = "";
    char textEOL[16] = "";
    GBool noPageBreaks = gFalse;
    GBool quiet = gFalse;

    // read config file
    globalParams = new GlobalParams(""); // <-- Maybe add some checking code here?
    if (textEncName[0])
    {
        globalParams->setTextEncoding(textEncName);
    }
    if (textEOL[0])
    {
        if (!globalParams->setTextEOL(textEOL))
        {
            fprintf(stderr, "Bad '-eol' value on command line\n");
        }
    }
    if (noPageBreaks)
    {
        globalParams->setTextPageBreaks(gFalse);
    }
    if (quiet)
    {
        globalParams->setErrQuiet(quiet);
    }

    // Set UNICODE support
    globalParams->setTextEncoding("UTF-8");

    // get mapping to output encoding
    if (!(uMap = globalParams->getTextEncoding()))
    {
        error(errConfig, -1, "Couldn't get text encoding");
        return false;
    }
    return true;
}

int Pdf2Txt(std::string PdfFile, std::string TxtFile) const
{
    GString* ownerPW, *userPW;
    TextOutputDev* textOut;
    TextOutputControl textOutControl;
    GString* textFileName;

    int exitCode;
    char ownerPassword[33] = "\001";
    char userPassword[33] = "\001";
    int firstPage = 1;
    int lastPage = 0;
    GBool tableLayout = gFalse;
    double fixedPitch = 0;
    GBool physLayout = gFalse;
    GBool simpleLayout = gFalse;
    GBool simple2Layout = gFalse;
    GBool linePrinter = gFalse;
    GBool rawOrder = gFalse;
    double fixedLineSpacing = 0;
    double marginLeft = 0;
    double marginRight = 0;
    double marginTop = 0;
    double marginBottom = 0;
    GBool clipText = gFalse;
    GBool discardDiag = gFalse;
    GBool insertBOM = gFalse;

    exitCode = 99;

    // open PDF file
    if (ownerPassword[0] != '\001')
    {
        ownerPW = new GString(ownerPassword);
    }
    else
    {
        ownerPW = NULL;
    }
    if (userPassword[0] != '\001')
    {
        userPW = new GString(userPassword);
    }
    else
    {
        userPW = NULL;
    }
    PDFDoc* doc = new PDFDoc((char*)PdfFile.c_str(), ownerPW, userPW);
    if (userPW)
    {
        delete userPW;
    }
    if (ownerPW)
    {
        delete ownerPW;
    }
    if (! doc->isOk())
    {
        exitCode = 1;
        goto err2;
    }

    // check for copy permission
    if (! doc->okToCopy())
    {
        error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
        exitCode = 3;
        goto err2;
    }

    // construct text file name
    textFileName = new GString(TxtFile.c_str());

    // get page range
    if (firstPage < 1)
    {
        firstPage = 1;
    }
    if (lastPage < 1 || lastPage > doc->getNumPages())
    {
        lastPage = doc->getNumPages();
    }

    // write text file
    if (tableLayout)
    {
        textOutControl.mode = textOutTableLayout;
        textOutControl.fixedPitch = fixedPitch;
    }
    else if (physLayout)
    {
        textOutControl.mode = textOutPhysLayout;
        textOutControl.fixedPitch = fixedPitch;
    }
    else if (simpleLayout)
    {
        textOutControl.mode = textOutSimpleLayout;
    }
    else if (simple2Layout)
    {
        textOutControl.mode = textOutSimple2Layout;
    }
    else if (linePrinter)
    {
        textOutControl.mode = textOutLinePrinter;
        textOutControl.fixedPitch = fixedPitch;
        textOutControl.fixedLineSpacing = fixedLineSpacing;
    }
    else if (rawOrder)
    {
        textOutControl.mode = textOutRawOrder;
    }
    else
    {
        textOutControl.mode = textOutReadingOrder;
    }
    textOutControl.clipText = clipText;
    textOutControl.discardDiagonalText = discardDiag;
    textOutControl.insertBOM = insertBOM;
    textOutControl.marginLeft = marginLeft;
    textOutControl.marginRight = marginRight;
    textOutControl.marginTop = marginTop;
    textOutControl.marginBottom = marginBottom;
    textOut = new TextOutputDev(textFileName->getCString(), &textOutControl, gFalse, gTrue);
    if (textOut->isOk())
    {
        doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gFalse, gTrue, gFalse);
    }
    else
    {
        delete textOut;
        exitCode = 2;
        goto err3;
    }
    delete textOut;

    exitCode = 0;

    // clean up
err3:
    delete textFileName;
err2:
    delete doc;
//  uMap->decRefCnt();
err1:
    // Do NOT delete the one and only GlobalParams instance!!!
    //delete globalParams;

    // check for memory leaks
    Object::memCheck(stderr);
    gMemReport(stderr);

    return exitCode;
}

The above code may not even compile (I modified it with a text editor, not really tested it) so please make any changes that may be required. It is quite expected that the xpdf functions do not modify the globalParams object (it's "read-only" for them) so this code has a good chance to work. Btw there is a #if MULTITHREADED directive in the GlobalParams class definition (GlobalParams.h) containing 3 mutex objects in its block. The implementation code (GlobalParams.cc) locks a mutex to access the GlobalParams members, so this may cause some threads to wait a little, although I can't tell how much (one would have to thoroughly examine the code, which is a small "project" in itself). You can try testing it.

Of course the concerns expressed by @KJ above still apply, running many such threads in parallel could overload the system (although i'm not sure if xpdf uses multiple threads to process a single PDF, could one help please, how is it configured?), esp if you are running this on a server do not allow too many concurrently-running conversions, it may cause other processes to slow down. It may also cause I/O bottleneck (disk and/or network), so experiment with few threads initially and check how it scales up.

Constantine Georgiou
  • 2,412
  • 1
  • 13
  • 17
  • Yes, `MULTITHREADED` flag is set it up. And I have made the modification you suggested, and it works perfect !!!! You got me out from a trouble !! How can I thank you ? I'll always vote you here ! – Flaviu_ Sep 21 '22 at 15:50