Use the full Stanford CoreNLP pipeline for your entire NLP tool chain. Avoid mixing in your own tokenizer, cleaner, POS tagger, etc.; they will not play well with the NER tool.
wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip
unzip stanford-corenlp-full-2015-12-09.zip
cd stanford-corenlp-full-2015-12-09
echo "Jack Frost works for Boeing Company. He manages 5 aircraft and their crew in London" > test.txt
java -cp "*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -file test.txt
cat test.txt.out
[out]:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root>
<document>
<sentences>
<sentence id="1">
<tokens>
<token id="1">
<word>Jack</word>
<lemma>Jack</lemma>
<CharacterOffsetBegin>0</CharacterOffsetBegin>
<CharacterOffsetEnd>4</CharacterOffsetEnd>
<POS>NNP</POS>
<NER>PERSON</NER>
<Speaker>PER0</Speaker>
</token>
<token id="2">
<word>Frost</word>
<lemma>Frost</lemma>
<CharacterOffsetBegin>5</CharacterOffsetBegin>
<CharacterOffsetEnd>10</CharacterOffsetEnd>
<POS>NNP</POS>
<NER>PERSON</NER>
<Speaker>PER0</Speaker>
</token>
<token id="3">
<word>works</word>
<lemma>work</lemma>
<CharacterOffsetBegin>11</CharacterOffsetBegin>
<CharacterOffsetEnd>16</CharacterOffsetEnd>
<POS>VBZ</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="4">
<word>for</word>
<lemma>for</lemma>
<CharacterOffsetBegin>17</CharacterOffsetBegin>
<CharacterOffsetEnd>20</CharacterOffsetEnd>
<POS>IN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="5">
<word>Boeing</word>
<lemma>Boeing</lemma>
<CharacterOffsetBegin>21</CharacterOffsetBegin>
<CharacterOffsetEnd>27</CharacterOffsetEnd>
<POS>NNP</POS>
<NER>ORGANIZATION</NER>
<Speaker>PER0</Speaker>
</token>
<token id="6">
<word>Company</word>
<lemma>Company</lemma>
<CharacterOffsetBegin>28</CharacterOffsetBegin>
<CharacterOffsetEnd>35</CharacterOffsetEnd>
<POS>NNP</POS>
<NER>ORGANIZATION</NER>
<Speaker>PER0</Speaker>
</token>
<token id="7">
<word>.</word>
<lemma>.</lemma>
<CharacterOffsetBegin>35</CharacterOffsetBegin>
<CharacterOffsetEnd>36</CharacterOffsetEnd>
<POS>.</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
</tokens>
<parse>(ROOT (S (NP (NNP Jack) (NNP Frost)) (VP (VBZ works) (PP (IN for) (NP (NNP Boeing) (NNP Company)))) (. .))) </parse>
<dependencies type="basic-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">works</dependent>
</dep>
<dep type="compound">
<governor idx="2">Frost</governor>
<dependent idx="1">Jack</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">works</governor>
<dependent idx="2">Frost</dependent>
</dep>
<dep type="case">
<governor idx="6">Company</governor>
<dependent idx="4">for</dependent>
</dep>
<dep type="compound">
<governor idx="6">Company</governor>
<dependent idx="5">Boeing</dependent>
</dep>
<dep type="nmod">
<governor idx="3">works</governor>
<dependent idx="6">Company</dependent>
</dep>
<dep type="punct">
<governor idx="3">works</governor>
<dependent idx="7">.</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">works</dependent>
</dep>
<dep type="compound">
<governor idx="2">Frost</governor>
<dependent idx="1">Jack</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">works</governor>
<dependent idx="2">Frost</dependent>
</dep>
<dep type="case">
<governor idx="6">Company</governor>
<dependent idx="4">for</dependent>
</dep>
<dep type="compound">
<governor idx="6">Company</governor>
<dependent idx="5">Boeing</dependent>
</dep>
<dep type="nmod:for">
<governor idx="3">works</governor>
<dependent idx="6">Company</dependent>
</dep>
<dep type="punct">
<governor idx="3">works</governor>
<dependent idx="7">.</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-ccprocessed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">works</dependent>
</dep>
<dep type="compound">
<governor idx="2">Frost</governor>
<dependent idx="1">Jack</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">works</governor>
<dependent idx="2">Frost</dependent>
</dep>
<dep type="case">
<governor idx="6">Company</governor>
<dependent idx="4">for</dependent>
</dep>
<dep type="compound">
<governor idx="6">Company</governor>
<dependent idx="5">Boeing</dependent>
</dep>
<dep type="nmod:for">
<governor idx="3">works</governor>
<dependent idx="6">Company</dependent>
</dep>
<dep type="punct">
<governor idx="3">works</governor>
<dependent idx="7">.</dependent>
</dep>
</dependencies>
</sentence>
<sentence id="2">
<tokens>
<token id="1">
<word>He</word>
<lemma>he</lemma>
<CharacterOffsetBegin>37</CharacterOffsetBegin>
<CharacterOffsetEnd>39</CharacterOffsetEnd>
<POS>PRP</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="2">
<word>manages</word>
<lemma>manage</lemma>
<CharacterOffsetBegin>40</CharacterOffsetBegin>
<CharacterOffsetEnd>47</CharacterOffsetEnd>
<POS>VBZ</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="3">
<word>5</word>
<lemma>5</lemma>
<CharacterOffsetBegin>48</CharacterOffsetBegin>
<CharacterOffsetEnd>49</CharacterOffsetEnd>
<POS>CD</POS>
<NER>NUMBER</NER>
<NormalizedNER>5.0</NormalizedNER>
<Speaker>PER0</Speaker>
</token>
<token id="4">
<word>aircraft</word>
<lemma>aircraft</lemma>
<CharacterOffsetBegin>50</CharacterOffsetBegin>
<CharacterOffsetEnd>58</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="5">
<word>and</word>
<lemma>and</lemma>
<CharacterOffsetBegin>59</CharacterOffsetBegin>
<CharacterOffsetEnd>62</CharacterOffsetEnd>
<POS>CC</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="6">
<word>their</word>
<lemma>they</lemma>
<CharacterOffsetBegin>63</CharacterOffsetBegin>
<CharacterOffsetEnd>68</CharacterOffsetEnd>
<POS>PRP$</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="7">
<word>crew</word>
<lemma>crew</lemma>
<CharacterOffsetBegin>69</CharacterOffsetBegin>
<CharacterOffsetEnd>73</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="8">
<word>in</word>
<lemma>in</lemma>
<CharacterOffsetBegin>74</CharacterOffsetBegin>
<CharacterOffsetEnd>76</CharacterOffsetEnd>
<POS>IN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="9">
<word>London</word>
<lemma>London</lemma>
<CharacterOffsetBegin>77</CharacterOffsetBegin>
<CharacterOffsetEnd>83</CharacterOffsetEnd>
<POS>NNP</POS>
<NER>LOCATION</NER>
<Speaker>PER0</Speaker>
</token>
</tokens>
<parse>(ROOT (S (NP (PRP He)) (VP (VBZ manages) (NP (NP (CD 5) (NN aircraft)) (CC and) (NP (NP (PRP$ their) (NN crew)) (PP (IN in) (NP (NNP London)))))))) </parse>
<dependencies type="basic-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="2">manages</dependent>
</dep>
<dep type="nsubj">
<governor idx="2">manages</governor>
<dependent idx="1">He</dependent>
</dep>
<dep type="nummod">
<governor idx="4">aircraft</governor>
<dependent idx="3">5</dependent>
</dep>
<dep type="dobj">
<governor idx="2">manages</governor>
<dependent idx="4">aircraft</dependent>
</dep>
<dep type="cc">
<governor idx="4">aircraft</governor>
<dependent idx="5">and</dependent>
</dep>
<dep type="nmod:poss">
<governor idx="7">crew</governor>
<dependent idx="6">their</dependent>
</dep>
<dep type="conj">
<governor idx="4">aircraft</governor>
<dependent idx="7">crew</dependent>
</dep>
<dep type="case">
<governor idx="9">London</governor>
<dependent idx="8">in</dependent>
</dep>
<dep type="nmod">
<governor idx="7">crew</governor>
<dependent idx="9">London</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="2">manages</dependent>
</dep>
<dep type="nsubj">
<governor idx="2">manages</governor>
<dependent idx="1">He</dependent>
</dep>
<dep type="nummod">
<governor idx="4">aircraft</governor>
<dependent idx="3">5</dependent>
</dep>
<dep type="dobj">
<governor idx="2">manages</governor>
<dependent idx="4">aircraft</dependent>
</dep>
<dep type="cc">
<governor idx="4">aircraft</governor>
<dependent idx="5">and</dependent>
</dep>
<dep type="nmod:poss">
<governor idx="7">crew</governor>
<dependent idx="6">their</dependent>
</dep>
<dep type="conj:and">
<governor idx="4">aircraft</governor>
<dependent idx="7">crew</dependent>
</dep>
<dep type="case">
<governor idx="9">London</governor>
<dependent idx="8">in</dependent>
</dep>
<dep type="nmod:in">
<governor idx="7">crew</governor>
<dependent idx="9">London</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-ccprocessed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="2">manages</dependent>
</dep>
<dep type="nsubj">
<governor idx="2">manages</governor>
<dependent idx="1">He</dependent>
</dep>
<dep type="nummod">
<governor idx="4">aircraft</governor>
<dependent idx="3">5</dependent>
</dep>
<dep type="dobj">
<governor idx="2">manages</governor>
<dependent idx="4">aircraft</dependent>
</dep>
<dep type="cc">
<governor idx="4">aircraft</governor>
<dependent idx="5">and</dependent>
</dep>
<dep type="nmod:poss">
<governor idx="7">crew</governor>
<dependent idx="6">their</dependent>
</dep>
<dep type="dobj" extra="true">
<governor idx="2">manages</governor>
<dependent idx="7">crew</dependent>
</dep>
<dep type="conj:and">
<governor idx="4">aircraft</governor>
<dependent idx="7">crew</dependent>
</dep>
<dep type="case">
<governor idx="9">London</governor>
<dependent idx="8">in</dependent>
</dep>
<dep type="nmod:in">
<governor idx="7">crew</governor>
<dependent idx="9">London</dependent>
</dep>
</dependencies>
</sentence>
</sentences>
<coreference>
<coreference>
<mention representative="true">
<sentence>1</sentence>
<start>1</start>
<end>3</end>
<head>2</head>
<text>Jack Frost</text>
</mention>
<mention>
<sentence>2</sentence>
<start>1</start>
<end>2</end>
<head>1</head>
<text>He</text>
</mention>
</coreference>
</coreference>
</document>
</root>
Or, to get JSON output instead:
java -cp "*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -file test.txt -outputFormat json
And if you really need a Python wrapper, see https://github.com/smilli/py-corenlp. First start the CoreNLP server from the CoreNLP directory, then query it from Python:
$ cd stanford-corenlp-full-2015-12-09
$ export CLASSPATH=protobuf.jar:joda-time.jar:jollyday.jar:xom-1.2.10.jar:stanford-corenlp-3.6.0.jar:stanford-corenlp-3.6.0-models.jar:slf4j-api.jar
$ java -mx4g edu.stanford.nlp.pipeline.StanfordCoreNLPServer &
$ cd
$ git clone https://github.com/smilli/py-corenlp.git
$ cd py-corenlp
$ python
>>> from corenlp import StanfordCoreNLP
>>> nlp = StanfordCoreNLP('http://localhost:9000')
>>> text = ("Jack Frost works for Boeing Company. He manages 5 aircraft and their crew in London")
>>> output = nlp.annotate(text, properties={'annotators': 'tokenize,ssplit,pos,ner', 'outputFormat': 'json'})
>>> output
{u'sentences': [{u'parse': u'SENTENCE_SKIPPED_OR_UNPARSABLE', u'index': 0, u'tokens': [{u'index': 1, u'word': u'Jack', u'lemma': u'Jack', u'after': u' ', u'pos': u'NNP', u'characterOffsetEnd': 4, u'characterOffsetBegin': 0, u'originalText': u'Jack', u'ner': u'PERSON', u'before': u''}, {u'index': 2, u'word': u'Frost', u'lemma': u'Frost', u'after': u' ', u'pos': u'NNP', u'characterOffsetEnd': 10, u'characterOffsetBegin': 5, u'originalText': u'Frost', u'ner': u'PERSON', u'before': u' '}, {u'index': 3, u'word': u'works', u'lemma': u'work', u'after': u' ', u'pos': u'VBZ', u'characterOffsetEnd': 16, u'characterOffsetBegin': 11, u'originalText': u'works', u'ner': u'O', u'before': u' '}, {u'index': 4, u'word': u'for', u'lemma': u'for', u'after': u' ', u'pos': u'IN', u'characterOffsetEnd': 20, u'characterOffsetBegin': 17, u'originalText': u'for', u'ner': u'O', u'before': u' '}, {u'index': 5, u'word': u'Boeing', u'lemma': u'Boeing', u'after': u' ', u'pos': u'NNP', u'characterOffsetEnd': 27, u'characterOffsetBegin': 21, u'originalText': u'Boeing', u'ner': u'ORGANIZATION', u'before': u' '}, {u'index': 6, u'word': u'Company', u'lemma': u'Company', u'after': u'', u'pos': u'NNP', u'characterOffsetEnd': 35, u'characterOffsetBegin': 28, u'originalText': u'Company', u'ner': u'ORGANIZATION', u'before': u' '}, {u'index': 7, u'word': u'.', u'lemma': u'.', u'after': u' ', u'pos': u'.', u'characterOffsetEnd': 36, u'characterOffsetBegin': 35, u'originalText': u'.', u'ner': u'O', u'before': u''}]}, {u'parse': u'SENTENCE_SKIPPED_OR_UNPARSABLE', u'index': 1, u'tokens': [{u'index': 1, u'word': u'He', u'lemma': u'he', u'after': u' ', u'pos': u'PRP', u'characterOffsetEnd': 39, u'characterOffsetBegin': 37, u'originalText': u'He', u'ner': u'O', u'before': u' '}, {u'index': 2, u'word': u'manages', u'lemma': u'manage', u'after': u' ', u'pos': u'VBZ', u'characterOffsetEnd': 47, u'characterOffsetBegin': 40, u'originalText': u'manages', u'ner': u'O', u'before': u' '}, {u'index': 3, u'after': u' ', 
u'word': u'5', u'lemma': u'5', u'normalizedNER': u'5.0', u'pos': u'CD', u'characterOffsetEnd': 49, u'characterOffsetBegin': 48, u'originalText': u'5', u'ner': u'NUMBER', u'before': u' '}, {u'index': 4, u'word': u'aircraft', u'lemma': u'aircraft', u'after': u' ', u'pos': u'NN', u'characterOffsetEnd': 58, u'characterOffsetBegin': 50, u'originalText': u'aircraft', u'ner': u'O', u'before': u' '}, {u'index': 5, u'word': u'and', u'lemma': u'and', u'after': u' ', u'pos': u'CC', u'characterOffsetEnd': 62, u'characterOffsetBegin': 59, u'originalText': u'and', u'ner': u'O', u'before': u' '}, {u'index': 6, u'word': u'their', u'lemma': u'they', u'after': u' ', u'pos': u'PRP$', u'characterOffsetEnd': 68, u'characterOffsetBegin': 63, u'originalText': u'their', u'ner': u'O', u'before': u' '}, {u'index': 7, u'word': u'crew', u'lemma': u'crew', u'after': u' ', u'pos': u'NN', u'characterOffsetEnd': 73, u'characterOffsetBegin': 69, u'originalText': u'crew', u'ner': u'O', u'before': u' '}, {u'index': 8, u'word': u'in', u'lemma': u'in', u'after': u' ', u'pos': u'IN', u'characterOffsetEnd': 76, u'characterOffsetBegin': 74, u'originalText': u'in', u'ner': u'O', u'before': u' '}, {u'index': 9, u'word': u'London', u'lemma': u'London', u'after': u'', u'pos': u'NNP', u'characterOffsetEnd': 83, u'characterOffsetBegin': 77, u'originalText': u'London', u'ner': u'LOCATION', u'before': u' '}]}]}
>>> annotated_sent0 = output['sentences'][0]
>>> for token in annotated_sent0['tokens']:
... print token['word'], token['lemma'], token['pos'], token['ner']
...
Jack Jack NNP PERSON
Frost Frost NNP PERSON
works work VBZ O
for for IN O
Boeing Boeing NNP ORGANIZATION
Company Company NNP ORGANIZATION
. . . O
Possibly this is the output you want:
>>> " ".join(token['lemma'] for token in annotated_sent0['tokens'])
Jack Frost work for Boeing Company .
>>> " ".join(token['word'] for token in annotated_sent0['tokens'])
Jack Frost works for Boeing Company .
If you want a wrapper that ships with NLTK itself, you will have to wait a little longer, until the corresponding NLTK issue is resolved ;P