4

We are using php, pypdfocr, and pdftotext to OCR and extract text from documents that have been scanned in or faxed to us. The issue is when the document is scanned or faxed upside down or if some pages are intended to be read landscape (so the text is rotated 90 degrees on the page)

Things I have tried:

  • in tessdata cp eng.traineddata osd.traineddata

The resulting OCR text layer for pages which have 90 degree text isn't bad, however pages that are upside down, it OCR's each word and flips it in place so that if 'This is a test' appears in the document but upside down then the text layer may read 'test a is This'

If there is a way to detect that a page is upside down, I can use pdftk to rotate the pages before I run them through the OCR (or I can remove the text layer if it was already OCR'd and run it through the OCR again after using pdftk to rotate).

Any solution that can be executed from a linux CLI at this point is a viable solution.

tempcke
  • 700
  • 8
  • 15

3 Answers3

5

You can get info about page orientation with tesseract (>=3.03 ?) easily. E.g.

$ tesseract image.png -  -psm 0

will produce this output

Orientation: 3
Orientation in degrees: 90
Orientation confidence: 25.40
Script: 1 
Script confidence: 18.40

Based on this information you can adjust the image rotation. An example of how to do this in Python can be found in the script Fix image rotation with tesseract.

user898678
  • 2,994
  • 2
  • 18
  • 17
  • Thanks, with this I can write a script to auto process the que (fix rotation and OCR) Once I have it finished I will post it back here in case it is useful to someone else in the future. – tempcke Aug 05 '15 at 19:02
  • 2
    With tesseract v4, this seems to work: `tesseract image.png - --psm 0` – jftuga May 02 '19 at 14:21
1

I had this same problem. My fix was to create a simple C++ application, which takes a PNG file name as parameter and rotates/deskews it automatically.

My code is

#include <iostream>
#include <cmath>
#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>

using namespace std;

int main(int argc, char **argv)
{

    if (argc != 2) {
        cerr << "usage: " << argv[0] << " <image>\n";
        exit(1);
    }

    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    // Initialize tesseract-ocr with English, without specifying tessdata path
    if (api->Init(NULL, "eng")) {
        cerr << "Could not initialize tesseract.\n";
        exit(2);
    }

    const char* inputfile = argv[1];
    tesseract::Orientation orientation;
    tesseract::WritingDirection direction;
    tesseract::TextlineOrder order;
    float deskew_angle;

    PIX *image = pixRead(inputfile);
    if (image == NULL) {
        cerr << "could not open " << inputfile << endl;
        return -2;
    }

    api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
    api->SetImage(image);
    api->Recognize(0);

    tesseract::PageIterator* it =  api->AnalyseLayout();
    it->Orientation(&orientation, &direction, &order, &deskew_angle);
    cout << "Orientation: " << orientation << 
            "\nWritingDirection: " << direction <<
            "\nTextlineOrder: " << order << 
            "\nDeskew angle: " << deskew_angle << "\n";

    PIX* pixd = NULL;
    switch (orientation) {
        case 0:
            cout << "image in the correct position, nothing to do\n";
            if (fabs(deskew_angle) > 0.0001f) {
                cout << "deskewing...\n";
                pixd = pixRotate(image, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
            }
            break;
        case 1:
            cout << "rotating image by 270 degrees\n";
            pixd = pixRotate90(image, -1);
            if (deskew_angle > 0.0001f) {
                cout << "deskewing...\n";
                pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
            }
            break;
        case 2:
            cout << "rotating image by 180 degrees\n";
            pixd = pixRotate180(NULL, image);
            if (deskew_angle > 0.0001f) {
                cout << "deskewing...\n";
                pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
            }
            break;
        case 3:
            cout << "rotating image by 90 degrees\n";
            pixd = pixRotate90(image, 1);
            if (deskew_angle > 0.0001f) {
                cout << "deskewing...\n";
                pixd = pixRotate(pixd, -deskew_angle, L_ROTATE_SHEAR, L_BRING_IN_WHITE, 0, 0);
            }
            break;
    }

    pixDestroy(&image);

    if (pixd != NULL) {
        pixWrite(inputfile, pixd, IFF_PNG);
        pixDestroy(&pixd);
    }

    return 0;
}

You can compile it with

g++ -o tesseract_fixposition tesseract_fixposition.cpp -llept -ltesseract

The dependencies are libtesseract and libleptonica. I tested with Tesseract versions 3.03 and 3.04, and Leptonica 1.72. I processed a few thousand images and didn't find any incorrect identification.

Hope this helps!

gxrmr
  • 11
  • 1
  • Nice. What I'm actually dealing with is a PDF though so I wrote a script in php which loops though the pages of a pdf, it runs the shell command convert to convert each page to a png, then i execute tesseract on the png page and parse the output, building a string as I go though the loop to pass to pdftk. This works perfectly but it took 4 mins to fix the rotation and then OCR the final document. The idea of a complied c++ app seems like a good idea as it would likely speed things up considerably. How hard would it be to modify this script to correct each page of a pdf? – tempcke Aug 06 '15 at 13:29
  • In my case it takes a few seconds per page. I use this app in a workflow that extracts each page using muPDF, prepares it for extraction and runs Tesseract for extraction. I use `mudraw -r 300 -g -o ` for each page, then this app for the generated PNGs. – gxrmr Aug 06 '15 at 15:01
1

If speed is an issue, you do not need to use tesseract to fix the page orientation. You can use just the leptonica functions. Something like this:

/*
 * Compile with:
 *     g++ fixorientation.cpp -o fixorientation -llept
 *
 */

#include <cstdio>
#include <cstring>
#include <string>

#include <leptonica/allheaders.h>

int main(int argc, char *argv[]) {
    const char* filename = NULL;
    const char* outfile = NULL;
    l_int32   orient, format;
    l_int32  alt_rot = -1;
    l_float32 upconf1, leftconf1;
    PIX       *fpixs, *pixs;

    if (argc < 1) {
        fprintf(stderr, "Usage is:\n\t%s -f filename [-o output]\n", argv[0]);
        return(1);
    } else {
        for (int i = 1; i < argc; i++) {
            if (i + 1 < argc) {
                if (strcmp(argv[i], "-f") == 0) {
                    filename = argv[i + 1];
                } else if (strcmp(argv[i], "-o") == 0) {
                    outfile = argv[i + 1];
                }
            }
        }
    }

    if (filename) {
        pixs = pixRead(filename);
    } else {
        fprintf(stderr, "Usage is:\n\t%s -f filename [-o output]\n", argv[0]);
        return(1);
    }

    if (pixs == NULL) {
        fprintf(stderr, "Unsupported image type.\n");
        return(3);
    }
    format = pixGetInputFormat(pixs);

    fpixs = pixConvertTo1(pixs, 130);
    pixOrientDetect(fpixs, &upconf1, &leftconf1, 0, 0);
    makeOrientDecision(upconf1, leftconf1, 0, 0, &orient, 1);

    if (orient == L_TEXT_ORIENT_UNKNOWN) {
        fprintf(stdout, "Confidence is low; no determination is made. "
                "But maybe there is %1 deg rotation.\n", alt_rot);
    } else if (orient == L_TEXT_ORIENT_UP) {
        fprintf(stdout, "Text is rightside-up\n");
        alt_rot = 0;
    } else if (orient == L_TEXT_ORIENT_LEFT) {
        fprintf(stdout, "Text is rotated 90 deg ccw\n");
        alt_rot = 1;
    } else if (orient == L_TEXT_ORIENT_DOWN) {
        fprintf(stdout, "Text is upside-down\n");
        alt_rot = 2;
    } else {  /* orient == L_TEXT_ORIENT_RIGHT */
        fprintf(stdout, "Text is rotated 90 deg cw\n");
        alt_rot = 3;
    }

    if (alt_rot > -1) {
        fpixs = pixRotateOrth(pixs, alt_rot);
        if (outfile) {
            pixWrite(outfile, fpixs, format);
        } else {
            char savefile[strlen("fixed_") + strlen(filename) + 1];
            strcpy(savefile, "fixed_");
            strcat(savefile, filename);
            fprintf(stdout, "Output save to %s\n", savefile);
            pixWrite(savefile, fpixs, format);

        }
    } else {
        return(2);
    }
    pixDestroy(&fpixs);
    pixDestroy(&pixs);
    return(0);
}
user898678
  • 2,994
  • 2
  • 18
  • 17