0

I have noticed that whenever I give Tesseract images with a dark background, I get a segmentation fault. I was trying to extract symbols (and their alternative choices) using this code:

#include <tesseract/baseapi.h>
#include <leptonica/allheaders.h>
#include <iostream>
#include <map>
#include <bits/stdc++.h>
using namespace std;
int main()
{
    char *outText;
    map<pair<char*,char*>,float> matrix;
    set<char> allChars;
    tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
    // Initialize tesseract-ocr with English, without specifying tessdata path
    if (api->Init(NULL, "eng")) {
        fprintf(stderr, "Could not initialize tesseract.\n");
        exit(1);
    }

   int a[256][256];
   for(int i=0;i<256;i++){
     for(int j=0;j<256;j++){
       a[i][j]=0;
     }
   }
    // Open input image with leptonica library
    string images[] = {List of images};
for (int ii=0;ii<7;ii++){
  Pix *image = pixRead((const char*) images[ii].c_str());
  cout << images[ii] << endl;
   api->Init(NULL, "eng");
   api->SetImage(image);
   string valid_set = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890~`!@#$%^&*()_-+=,./<>/:;'[]{}|";
   api->SetVariable("tessedit_char_whitelist", valid_set.c_str());
   api->SetVariable("save_blob_choices", "T");
   //api->SetRectangle(37, 128,648, 523);
   //api->SetRectangle(30, 86, 590, 100);
   //api->SetRectangle(30,100,430,30);
   api->Recognize(NULL);

   tesseract::ResultIterator* ri = api->GetIterator();
   tesseract::PageIteratorLevel level = tesseract::RIL_SYMBOL;
   if(ri != 0) {
       do {
           const char* symbol = ri->GetUTF8Text(level);
           //cout << symbol << endl;
           if(ri!=0){
             float conf = ri->Confidence(level);
           }
           //cout << "err" << endl;
           if(symbol != 0) {
              //printf("symbol %s, conf: %f", symbol, conf);
               bool indent = false;
               tesseract::ChoiceIterator ci(*ri);
               do {
                   const char* choice = ci.GetUTF8Text();
                   if (indent) //printf("\t\t ");
                  // printf("\t- ");
                  //cout << symbol<<" Look up "<<choice << endl;
                  matrix[make_pair(strdup(symbol), strdup(choice))]=ci.Confidence();
                   //printf("%s conf: %f\n", choice, ci.Confidence());
                   indent = true;
               } while(ci.Next());
           }
           //printf("---------------------------------------------\n");
           delete[] symbol;
       } while((ri->Next(level)));
   }

   int count = 0;
   for(map<pair<char*,char*>,float>::iterator it = matrix.begin();it!=matrix.end();it++){
      allChars.insert((strdup)(it->first.first)[0]);
      allChars.insert((strdup)(it->first.second)[0]);
      //cout<<it->first.first<<" "<<it->first.second<<endl;
      //cout << (strdup)(it->first.first)[0]<<" "<<(strdup)(it->first.second)[0]<<endl;
      a[(strdup)(it->first.first)[0]][(strdup)(it->first.second)[0]]+=it->second;
      count++;
   }
  // cout << count << endl;
   for(set<char>::iterator it = allChars.begin();it!=allChars.end();it++){
     //cout << *it << endl;
   }
   for(int i=0;i<256;i++){
     for(int j=0;j<256;j++){
       if(a[i][j]!=0){
        ///cout << i << " " <<j<<endl;
        //cout << a[i][j]<<endl;
       }
       //cout << a[i][j] << endl;
     }
   }

   api->End();
    pixDestroy(&image);
}
    return 0;

}`

Specifically, the segmentation fault occurs at this line of the code:

float conf = ri->Confidence(level);

What could be the solution to this problem? Should we train Tesseract with more dark images?

Edit: Sample Image enter image description here

Ahmet
  • 7,527
  • 3
  • 23
  • 47
iLoveCamelCase
  • 450
  • 10
  • 21
  • 1
    Could you provide a sample image? – EGHM Sep 25 '15 at 18:13
  • 1
    I think training Tesseract can be tedious. To me, the solution sounds exactly like what you stated: image preprocessing. Usually one can get far enough by thresholding the image, which removes a relatively dark background. But it really depends on what kind of images you have, so if you share a representative sample, people may be able to help you. There may also be a problem with the code, but I have no idea if there is one. – Artem Fedosov Sep 25 '15 at 20:17
  • I tried using the AHE algorithm, but it didn't give me satisfactory results. Are there any other contrast enhancement algorithms? – iLoveCamelCase Sep 25 '15 at 20:20
  • Fred has some examples that start off looking a bit like your image for his textcleaner script http://www.fmwconcepts.com/imagemagick/textcleaner/index.php – EGHM Sep 25 '15 at 23:43
  • I tried this; the background has improved, but I am still getting the segmentation fault. – iLoveCamelCase Sep 26 '15 at 13:17

0 Answers0