I had the same issue for the last few days. I resolved it by removing tess4j and using Tika 1.27 + Tesseract.
I used an ExecutorService to run 3 threads at a time; this kept memory within limits.
byte[] fileBytes; // image bytes

// Submit the OCR work to the executor (declared elsewhere) and get a handle on the result.
Future<String> future = executorService.submit(() -> {
    // Tesseract settings: Korean + English, image pre-processing on, keep inter-word spacing.
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setLanguage("kor+eng");
    ocrConfig.setEnableImageProcessing(1);
    ocrConfig.setPreserveInterwordSpacing(true);

    // The parser picks up the OCR settings through the parse context.
    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, ocrConfig);

    Parser autoParser = new AutoDetectParser();
    BodyContentHandler textHandler = new BodyContentHandler();
    Metadata imageMetadata = new Metadata();
    autoParser.parse(new ByteArrayInputStream(fileBytes), textHandler, imageMetadata, parseContext);

    // The handler accumulated the extracted text during parsing.
    return textHandler.toString();
});

// Block for at most 120 seconds waiting for the extracted text.
fileBody = future.get(120, TimeUnit.SECONDS);
While the code given above works, I later made it simpler by just spawning a process to call tesseract directly.
/**
 * Extracts text from an image by writing it to a temporary file and invoking the
 * {@code tesseract} command-line tool on it.
 *
 * <p>Both the temporary input file and the generated output file are removed before
 * this method returns, whether or not OCR succeeded.
 *
 * @param fileBytes raw bytes of the image to recognise
 * @param timeout   maximum time, in seconds, passed to {@code runCommand} for the tesseract process
 * @param language  value for tesseract's {@code -l} option (for example {@code "kor+eng"})
 * @return the recognised text, or {@code null} if no output file was produced or an error occurred
 */
protected String doOcr(byte[] fileBytes, int timeout, String language) {
    String text = null;
    File inputFile = null;
    File outputFile = null;
    try {
        inputFile = File.createTempFile("tesseract-input", ".png");
        // tesseract is given an output base path; the result is expected at "<base>.txt".
        String outputPath = inputFile.getAbsolutePath() + "-output";
        outputFile = new File(outputPath + ".txt");
        try (FileOutputStream fos = new FileOutputStream(inputFile)) {
            fos.write(fileBytes);
        }
        String[] commandCreate = { "tesseract", inputFile.getAbsolutePath(), outputPath, "-l", language, "--psm", "1", "-c", "preserve_interword_spaces=1" };
        runCommand(commandCreate, timeout);
        if (outputFile.exists()) {
            try (FileInputStream fis = new FileInputStream(outputFile)) {
                text = IOUtils.toString(fis, Constants.UTF_8);
            }
        }
    } catch (InterruptedException e) {
        logger.warn("timeout trying to read image file body");
        // Restore the interrupt flag so callers (e.g. an executor cancelling this task)
        // can still observe that the thread was interrupted.
        Thread.currentThread().interrupt();
    } catch (Exception e) {
        logger.error(String.format("Cannot read image file body, error : %s", e.getMessage()), e);
    } finally {
        // Best-effort cleanup; a failed delete is reported but never masks the OCR result.
        if (null != inputFile && inputFile.exists() && !inputFile.delete()) {
            logger.warn("could not delete temporary file : " + inputFile.getAbsolutePath());
        }
        if (null != outputFile && outputFile.exists() && !outputFile.delete()) {
            logger.warn("could not delete temporary file : " + outputFile.getAbsolutePath());
        }
    }
    return text;
}
/**
 * Starts the given command as a child process and waits up to {@code timeout} seconds
 * for it to finish.
 *
 * <p>The child inherits this process's standard streams and runs with
 * {@code OMP_THREAD_LIMIT=1}. The process is always forcibly destroyed before this
 * method returns or throws, so a timeout or an interrupt does not leave it running.
 * A timeout is only logged; it is not reported to the caller as an exception.
 *
 * @param command program and arguments to execute
 * @param timeout maximum time to wait for the process, in seconds
 * @throws IOException          if the process cannot be started
 * @throws InterruptedException if the current thread is interrupted while waiting
 */
protected void runCommand(String[] command, int timeout) throws IOException, InterruptedException {
    logger.info("command : " + StringUtils.join(command, " "));
    ProcessBuilder builder = new ProcessBuilder(command);
    builder.inheritIO();
    builder.environment().put("OMP_THREAD_LIMIT", "1"); /* default tesseract uses 4 threads per image. set to 1 */
    Process p = builder.start();
    try {
        boolean finished = p.waitFor(timeout, TimeUnit.SECONDS);
        if (!finished) {
            logger.warn("task not finished");
        } else if (p.exitValue() != 0) {
            // Surface failures that would otherwise only show up as a missing output file.
            logger.warn("command exited with code " + p.exitValue());
        }
    } finally {
        // Runs on timeout, normal exit and interruption alike; a no-op if already terminated.
        p.destroyForcibly();
    }
}