I am able to extract the text from images, or to extract the images from a PDF and then run OCR on them to get the text. However, what I want is to create a searchable PDF from images, or from a PDF that contains images. This should somehow be possible with ITessAPI (TessAPI1 or Tesseract1), but I don't know how to get started correctly.
Edit:
/**
 * Runs Tesseract OCR on an image and writes hOCR, box-text, plain-text and PDF
 * output next to the hard-coded output base {@code C:\Application\ResultRenderer}.
 *
 * <p>Initialisation or processing failures are reported by returning silently
 * (the logger calls are still commented out); native resources are released on
 * every exit path.
 *
 * @param tessDataPath directory handed to {@code TessBaseAPIInit4} as the data path
 * @param imagePath    path of the input image handed to {@code TessBaseAPIProcessPages}
 * @throws IOException declared for callers; nothing in the current body throws it
 */
public void doOcr(String tessDataPath, String imagePath) throws IOException {
    int set_only_init_params = FALSE;
    int oem = ITessAPI.TessOcrEngineMode.OEM_DEFAULT;
    PointerByReference configs = null;
    int configs_size = 0;
    // No init-time variables are passed (both vectors are null), so the
    // advertised vector size must be 0 rather than the length of an unused array.
    NativeSize vars_vec_size = new NativeSize(0);

    TessAPI1 api = new TessAPI1();
    ITessAPI.TessBaseAPI handle = api.TessBaseAPICreate();
    try {
        int rc = api.TessBaseAPIInit4(handle, tessDataPath, "eng", oem, configs, configs_size,
                null, null, vars_vec_size, set_only_init_params);
        if (rc != 0) {
            //logger.error("Could not initialize tesseract.");
            return;
        }
        // Only configure the engine once initialisation is known to have succeeded.
        api.TessBaseAPISetVariable(handle, "user_defined_dpi", "270");

        String outputbase = "C:\\Application\\ResultRenderer";
        // Head of the renderer chain; it is kept in its own variable so that it
        // can still be deleted after the chain has been walked.
        ITessAPI.TessResultRenderer renderer = api.TessHOcrRendererCreate(outputbase);
        try {
            api.TessResultRendererInsert(renderer, api.TessBoxTextRendererCreate(outputbase));
            api.TessResultRendererInsert(renderer, api.TessTextRendererCreate(outputbase));
            String dataPath = api.TessBaseAPIGetDatapath(handle);
            api.TessResultRendererInsert(renderer, api.TessPDFRendererCreate(outputbase, dataPath, FALSE));

            int result = api.TessBaseAPIProcessPages(handle, imagePath, null, 0, renderer);
            if (result == FALSE) {
                //logger.error("Error during processing.");
                return;
            }

            // Walk the whole chain, head included, with a separate cursor.
            for (ITessAPI.TessResultRenderer current = renderer;
                    current != null;
                    current = api.TessResultRendererNext(current)) {
                String ext = api.TessResultRendererExtention(current).getString(0);
                String log = String.format("TessResultRendererExtention: %s\nTessResultRendererTitle: %s\nTessResultRendererImageNum: %d",
                        ext,
                        api.TessResultRendererTitle(current).getString(0),
                        api.TessResultRendererImageNum(current));
                //logger.info(log);
            }
        } finally {
            // Delete the head, not the (null) end-of-chain cursor.
            // NOTE(review): deleting the head is expected to release the inserted
            // renderers too and to close their output files -- confirm against
            // the Tesseract renderer documentation.
            api.TessDeleteResultRenderer(renderer);
        }

        File ocrPdf = new File(outputbase + ".pdf");
        if (!ocrPdf.exists()) {
            // TODO
        }
    } finally {
        // Frees the native handle on every path. The original success path only
        // called TessBaseAPIEnd and never released the handle itself.
        // NOTE(review): Delete is expected to end the engine as well -- confirm.
        api.TessBaseAPIDelete(handle);
    }
}