4

I am trying to build a custom NER using Apache OpenNLP 1.7. Based on the documentation available here, I have developed the following code:

import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderFactory;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/**
 * Trains an OpenNLP name finder model of type "person" (language "en") from an
 * annotated training file and serializes the resulting model to {@link #modelFile}.
 *
 * NOTE(review): this is the listing as posted in the question and it does not work as-is;
 * see the review notes on the individual lines below.
 */
public class PersonClassifierTrainer {

        // Path of the file the trained model is serialized to.
        static String modelFile = "/opt/NLP/data/en-ner-customperson.bin";

        /**
         * Reads the training data, trains the name finder model and writes it to {@code modelFile}.
         *
         * @param args command-line arguments; not read by this method
         * @throws IOException propagated from the stream and file operations in the body
         */
        public static void main(String[] args) throws IOException {

            Charset charset = Charset.forName("UTF-8");
            // NOTE(review): the leading and trailing ** are markdown emphasis from the post, not Java syntax.
            // A FileInputStream is passed here; the error quoted after this listing reports that an
            // opennlp.tools.util.InputStreamFactory is expected instead.
            **ObjectStream<String> lineStream = new PlainTextByLineStream(new FileInputStream("/opt/NLP/data/person.train"), charset);**
            // Wraps the line stream so that it yields NameSample objects for training.
            ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

            TokenNameFinderModel model;
            // NOTE(review): the factory is left null and handed to train() below; presumably this is the
            // cause of the NullPointerException mentioned in "Edit2" - confirm against the answer.
            TokenNameFinderFactory nameFinderFactory = null;

            try {
                model = NameFinderME.train("en", "person", sampleStream, TrainingParameters.defaultParams(),
                        nameFinderFactory);
            } finally {
                // Close the sample stream whether or not training succeeded.
                sampleStream.close();
            }

            BufferedOutputStream modelOut = null;

            try {
                modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
                model.serialize(modelOut);
            } finally {
                // modelOut is still null if opening the output file failed.
                if (modelOut != null)
                    modelOut.close();
            }
        }
    }

For the line highlighted above, the IDE shows the suggestion: "Cast argument 'file' to 'InputStreamFactory'".

I am forced to add this cast, because it shows an error otherwise.

Now when I run my code, I get the following error

java.io.FileInputStream cannot be cast to opennlp.tools.util.InputStreamFactory

Is there anything missing here?

Edit 1: Person.train file has this data

<START:person> Hardik <END> is a software Professional.<START:person> Hardik works at company<END> and <START:person> is part of development team<END>. <START:person> Hardik<END> lives in New York
<START:person> Hardik<END> loves R statistical software
<START:person> Hardik<END> is a student at ISB
<START:person> Hardik<END> loves nature

Edit 2: I am now getting a NullPointerException. Any help?

Vadim Kotov
  • 8,084
  • 8
  • 48
  • 62
Hardik Gupta
  • 4,700
  • 9
  • 41
  • 83

1 Answer

6

You need an instance of InputStreamFactory which will retrieve your InputStream. Additionally, TokenNameFinderFactory must not be null.

public class PersonClassifierTrainer {

    static String modelFile = "/opt/NLP/data/en-ner-customperson.bin";

    public static void main(String[] args) throws IOException {

        InputStreamFactory isf = new InputStreamFactory() {
            public InputStream createInputStream() throws IOException {
                return new FileInputStream("/opt/NLP/data/person.train");
            }
        };

        Charset charset = Charset.forName("UTF-8");
        ObjectStream<String> lineStream = new PlainTextByLineStream(isf, charset);
        ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

        TokenNameFinderModel model;
        TokenNameFinderFactory nameFinderFactory = new TokenNameFinderFactory();

        try {
            model = NameFinderME.train("en", "person", sampleStream, TrainingParameters.defaultParams(),
                    nameFinderFactory);
        } finally {
            sampleStream.close();
        }

        BufferedOutputStream modelOut = null;

        try {
            modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
            model.serialize(modelOut);
        } finally {
            if (modelOut != null)
                modelOut.close();
        }
    }
}

Edit 1: Person.train file has this data

<START:person> Hardik <END> is a software Professional.<START:person> Hardik works at company<END> and <START:person> is part of development team<END>. <START:person> Hardik<END> lives in New York
<START:person> Hardik<END> loves R statistical software
<START:person> Hardik<END> is a student at ISB
<START:person> Hardik<END> loves nature
schrieveslaach
  • 1,689
  • 1
  • 15
  • 32
  • it works, but now am getting null pointer exception, can you try once to run at your end, have attached my person.train file content – Hardik Gupta Jan 17 '17 at 14:53
  • `TokenNameFinderFactory` must not be null. See updated code. – schrieveslaach Jan 17 '17 at 15:27
  • Schrieveslaach, I think it's important for Apache to update this in the official documentation, surprising how no one faced this issue till date. – Hardik Gupta Jan 18 '17 at 02:05
  • Can you create an issue (see [here](https://issues.apache.org/jira/browse/OPENNLP/?selectedTab=com.atlassian.jira.jira-projects-plugin:summary-panel))? – schrieveslaach Jan 18 '17 at 08:19