1

I am trying to parse a .doc file that contains Greek characters (alpha, beta, gamma, etc.) using Apache Tika, but the text Tika returns is completely different from what I expected. I am using the code below to parse the .doc file:

// Excerpt of the parsing code: extracts the body text of a document with Apache Tika.
// NOTE(review): FileInputStream has no no-argument constructor; the file argument
// appears to have been elided here (the complete program further down passes a File).
FileInputStream fileInputStream = new FileInputStream();
// AutoDetectParser: presumably picks the concrete parser from the detected document type.
Parser parser = new AutoDetectParser();
// NOTE(review): -1 is understood to disable BodyContentHandler's write limit — confirm against the Tika API docs.
BodyContentHandler handler = new BodyContentHandler(-1);
ParseContext parseContext = new ParseContext();
// Registers the parser in the context (presumably so embedded documents are parsed with it too).
parseContext.set(Parser.class, parser);
Metadata metadatafromtika = new Metadata();
// Adds a UTF-8 content-encoding entry to the input metadata.
// NOTE(review): this is presumably only a hint and may have no effect on binary formats such as .doc — verify.
metadatafromtika.add(Metadata.CONTENT_ENCODING,"UTF-8");
parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
            // The handler's string form is the text collected during the parse.
            String text = handler.toString();

I am using the UTF-8 encoding in the line

metadatafromtika.add(Metadata.CONTENT_ENCODING,"UTF-8");

Below are the dependencies I am using:

<dependencies>
<dependency>
  <groupId>org.apache.tika</groupId>
  <artifactId>tika-parsers</artifactId>
  <version>1.18</version>
</dependency>

<dependency>
  <groupId>commons-collections</groupId>
  <artifactId>commons-collections</artifactId>
  <version>3.2.1</version>
</dependency>

<dependency>
  <groupId>org.apache.logging.log4j</groupId>
  <artifactId>log4j-core</artifactId>
  <version>2.9.1</version>
</dependency>

<dependency>
  <groupId>org.antlr</groupId>
  <artifactId>ST4</artifactId>
  <version>4.0.8</version>
</dependency>

<dependency>
  <groupId>org.postgresql</groupId>
  <artifactId>postgresql</artifactId>
  <version>42.1.4</version>
</dependency>

<dependency>
  <groupId>com.zaxxer</groupId>
  <artifactId>HikariCP</artifactId>
  <version>2.7.2</version>
</dependency>

<dependency>
  <groupId>commons-dbutils</groupId>
  <artifactId>commons-dbutils</artifactId>
  <version>1.6</version>
</dependency>

<dependency>
  <groupId>commons-io</groupId>
  <artifactId>commons-io</artifactId>
  <version>2.5</version>
</dependency>

<dependency>
  <groupId>org.json</groupId>
  <artifactId>json</artifactId>
  <version>20171018</version>
</dependency>

<dependency>
  <groupId>org.apache.hive</groupId>
  <artifactId>hive-jdbc</artifactId>
  <version>1.1.0-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-common</artifactId>
  <version>2.6.0-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-hdfs</artifactId>
  <version>2.6.0-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-mapreduce-client-core</artifactId>
  <version>2.6.0-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-tools</artifactId>
  <version>2.6.0-mr1-cdh5.10.1</version>
</dependency>

<dependency>
  <groupId>org.apache.htrace</groupId>
  <artifactId>htrace-core4</artifactId>
  <version>4.0.1-incubating</version>
</dependency>

<dependency>
  <groupId>com.google.code.gson</groupId>
  <artifactId>gson</artifactId>
  <version>2.8.1</version>
</dependency>

<dependency>
  <groupId>com.levigo.jbig2</groupId>
  <artifactId>levigo-jbig2-imageio</artifactId>
  <version>1.6.5</version>
</dependency>

<dependency>
  <groupId>com.github.jai-imageio</groupId>
  <artifactId>jai-imageio-core</artifactId>
  <version>1.3.1</version>
</dependency>

<dependency>
  <groupId>com.fasterxml.jackson.core</groupId>
  <artifactId>jackson-core</artifactId>
  <version>2.9.5</version>
</dependency>
</dependencies>

the content in the word document is

enter image description here

the output which I get when I use the above tika code is

enter image description here

Is UTF-8 encoding not suitable for parsing Greek characters with Apache Tika, or am I missing something in the code?

Thanks in advance

EDIT: Here is the complete Java code I am using:

      import org.apache.commons.io.FileUtils;
      import org.apache.tika.metadata.Metadata;
      import org.apache.tika.parser.AutoDetectParser;
      import org.apache.tika.parser.ParseContext;
      import org.apache.tika.parser.Parser;
      import org.apache.tika.sax.BodyContentHandler;
      import java.io.File;
      import java.io.FileInputStream;
      import java.nio.charset.StandardCharsets;


    /**
     * Command-line tool that extracts the body text of a document with Apache Tika,
     * prints it to standard output and saves it as a UTF-8 encoded text file.
     *
     * <p>Usage: {@code Tika <inputFile> <outputDirectory>}. The text is written to
     * {@code <outputDirectory>/<inputFileName>_content.txt}.
     */
    public class Tika {

        /**
         * Parses the input file and writes the extracted text to the output directory.
         * Any failure is reported as a stack trace on standard error.
         *
         * @param args {@code args[0]} is the path of the file to parse,
         *             {@code args[1]} is the directory that receives the text file
         */
        public static void main(String[] args) {
            try {
                if (args.length < 2) {
                    System.err.println("usage: Tika <inputFile> <outputDirectory>");
                    return;
                }
                String inputPath = args[0];
                String outputPath = args[1];
                File f = new File(inputPath);
                System.out.println("path is : " + f.getAbsoluteFile());

                String text;
                // try-with-resources closes the stream even when parsing fails.
                try (FileInputStream fileInputStream = new FileInputStream(f)) {
                    Parser parser = new AutoDetectParser();
                    // NOTE(review): -1 is understood to disable the handler's write limit — confirm.
                    BodyContentHandler handler = new BodyContentHandler(-1);
                    ParseContext parseContext = new ParseContext();
                    parseContext.set(Parser.class, parser);
                    Metadata metadatafromtika = new Metadata();
                    // NOTE(review): presumably only a hint; may not affect binary formats such as .doc.
                    metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");
                    parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
                    text = handler.toString();
                }

                System.out.println("done parsing for file : " + f.getAbsolutePath());
                System.out.println("text is : \n" + text);

                // Encode and decode with the SAME charset. The previous code encoded with
                // the platform default charset and decoded as UTF-8, which garbles every
                // non-ASCII character on platforms whose default is not UTF-8.
                byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
                String encodedText = new String(bytes, StandardCharsets.UTF_8);
                System.out.println("encoded text is : " + encodedText);

                // The charset is applied exactly once, when the String is turned into bytes on disk.
                File outputFile = new File(outputPath + File.separator + f.getName() + "_content.txt");
                FileUtils.writeStringToFile(outputFile, text, "UTF-8");
            }
            catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

EDIT 2: Below is the code that uses PrintWriter:

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;


/**
 * Command-line tool that extracts the body text of a document with Apache Tika
 * and writes it to a UTF-8 encoded text file through a {@link PrintWriter}.
 *
 * <p>Usage: {@code TikaTmp <inputFile> <outputDirectory>}. The text is written to
 * {@code <outputDirectory>/<inputFileName>_content.txt}.
 */
public class TikaTmp {

    /**
     * Parses the input file and writes the extracted text to the output directory.
     * Any failure is reported as a stack trace on standard error.
     *
     * @param args {@code args[0]} is the path of the file to parse,
     *             {@code args[1]} is the directory that receives the text file
     */
    public static void main(String[] args) {
        try {
            if (args.length < 2) {
                System.err.println("usage: TikaTmp <inputFile> <outputDirectory>");
                return;
            }
            String inputPath = args[0];
            String outputPath = args[1];
            File f = new File(inputPath);
            System.out.println("path is : " + f.getAbsoluteFile());

            Parser parser = new AutoDetectParser();
            // NOTE(review): -1 is understood to disable the handler's write limit — confirm.
            BodyContentHandler handler = new BodyContentHandler(-1);
            ParseContext parseContext = new ParseContext();
            parseContext.set(Parser.class, parser);
            Metadata metadatafromtika = new Metadata();
            // NOTE(review): presumably only a hint; may not affect binary formats such as .doc.
            metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");

            // try-with-resources closes both streams on every path, including failures.
            try (FileInputStream fileInputStream = new FileInputStream(f)) {
                parser.parse(fileInputStream, handler, metadatafromtika, parseContext);

                // Was "output_path", an undefined identifier; the variable is outputPath.
                File outputFile = new File(
                    outputPath + File.separator + f.getName() + "_content.txt");
                try (PrintWriter printWriter = new PrintWriter(outputFile, "UTF-8")) {
                    printWriter.write(handler.toString());
                    // PrintWriter never throws on write errors; checkError() flushes
                    // and reports whether one occurred.
                    if (printWriter.checkError()) {
                        throw new IOException("failed to write " + outputFile);
                    }
                }
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

}

EDIT 3: The characters I am trying to parse come from the Symbol font that Microsoft Word uses. Tika fails only for characters from the Symbol font:

enter image description here

I assume these are not actual Greek characters; they only look like Greek characters.

Akhil
  • 391
  • 3
  • 20
  • How are you looking at the string you get back from Tika? In your IDE? On the console? Printed to a file? I think your encoding problem is on output from Tika... – Gagravarr Aug 07 '20 at 08:50
  • @Gagravarr I see that on the console as well as in the file ,the output is same.Is there a way to fix the encoding problem – Akhil Aug 07 '20 at 09:17
  • Set the encoding you want when you print out then – Gagravarr Aug 07 '20 at 09:36
  • @Gagravarr, this is what I did byte[] bytes = text.getBytes(); String encodedText= new String(bytes, StandardCharsets.UTF_8); System.out.println("encoded text is : "+encodedText); Is this what you were suggesting? , Let me know If I am missing anything – Akhil Aug 07 '20 at 09:44
  • You should edit your code into your question, so we can see what you're doing. That code looks wrong and likely the source of your problem, but with just a snippet in a comment it's hard to tell... – Gagravarr Aug 07 '20 at 10:41
  • @Gagravarr,I posted the java code in the question – Akhil Aug 07 '20 at 11:18
  • Strings in Java are already held in Unicode, so you are corrupting your data with your code. Just use something like a PrintWriter to do the string to output conversion once – Gagravarr Aug 07 '20 at 15:39
  • @Gagravarr, I used the below printwriter code PrintWriter printWriter = new PrintWriter( new File(outputPath + File.separator + f.getName() + "_content.txt")); printWriter.write(String.valueOf(handler)); printWriter.flush(); printWriter.close(); I removed the line String text=handler.toString() and used the printwriter as you suggested ,But still the output is same , Let me know if I missed anything – Akhil Aug 11 '20 at 09:19
  • You do need to set UTF-8 when you use the writer, if you edit your code into your question it's easier to see and suggest changes – Gagravarr Aug 11 '20 at 09:45
  • @Gagravarr,I have put my code which uses PrintWriter in the question ,I have also used UTF-8 encoding with the PrintWriter Object ,But still the output is same – Akhil Aug 11 '20 at 10:19
  • @Gagravarr I found the issue , the characters which I am trying to parse are the ones which are from the Symbol font in microsoft word , Microsoft word has a special font called Symbol which contains some maths and greek characters I assume those are not the real greek characters ,but they look like a greek character,I have posted an image of the symbol font in question , Tika parses the greek characters correctly ,But when it encounters characters from symbol font it cannot parse them correctly – Akhil Aug 27 '20 at 09:34

0 Answers0