I am trying to use NER (named-entity recognition) in Solr (SolrCloud) 7.3, but I cannot find much documentation on it.
What I have found:
What I have done:
Parts of solrconfig.xml
<!-- Update chain "multiple-extract": OpenNLP named-entity extraction first,
     then update logging, then the processor that actually runs the update. -->
<updateRequestProcessorChain name="multiple-extract">
  <!-- NER step: the model in "modelFile" is applied to the "source" field,
       tokenized via the "analyzerFieldType" field type; results go to "dest". -->
  <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
    <str name="modelFile">opennlp/en-ner-person.bin</str>
    <str name="analyzerFieldType">text_opennlp</str>
    <str name="source">description_en</str>
    <str name="dest">content</str>
  </processor>
  <processor class="solr.LogUpdateProcessorFactory"/>
  <processor class="solr.RunUpdateProcessorFactory"/>
</updateRequestProcessorChain>
<!-- Dedicated update endpoint: requests sent to /myupdate use the
     "multiple-extract" chain by default. The handler name is matched against
     the request path, so it must not carry stray whitespace (the original
     value "/myupdate " had a trailing space). -->
<requestHandler name="/myupdate" class="solr.UpdateRequestHandler">
  <lst name="defaults">
    <str name="update.chain">multiple-extract</str>
  </lst>
</requestHandler>
Parts of the managed-schema
<!-- Field declarations. Attribute order is normalized to
     name, type, indexed, stored, then the remaining flags. -->
<field name="_root_" type="string" indexed="true" stored="false" docValues="false"/>
<field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/>
<field name="_version_" type="plong" indexed="false" stored="false"/>
<!-- "content" is the "dest" field of the NER update processor. -->
<field name="content"
       type="text_opennlp"
       indexed="true"
       stored="true"
       required="true"
       multiValued="true"
       docValues="false"
       termVectors="true"
       termPositions="true"
       termOffsets="true"
       termPayloads="true"/>
<field name="content_pos"
       type="text_opennlp_pos"
       indexed="true"
       stored="true"
       required="true"
       multiValued="true"
       docValues="false"
       termVectors="true"
       termPositions="true"
       termOffsets="true"
       termPayloads="true"/>
<!-- "description_en" is the "source" field of the NER update processor. -->
<field name="description_en" type="text_en" indexed="true" stored="true"/>
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
<!-- Text type whose analyzer consists only of the OpenNLP tokenizer,
     configured with a sentence model and a tokenizer model. -->
<fieldType name="text_opennlp" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.OpenNLPTokenizerFactory"
               sentenceModel="opennlp/en-sent.bin"
               tokenizerModel="opennlp/en-token.bin"/>
  </analyzer>
</fieldType>
<!-- Text type: OpenNLP tokenizer, followed by the OpenNLP part-of-speech
     filter and a filter that stores each token's type as its payload. -->
<fieldType name="text_opennlp_pos" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.OpenNLPTokenizerFactory"
               sentenceModel="opennlp/en-sent.bin"
               tokenizerModel="opennlp/en-token.bin"/>
    <filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="opennlp/en-pos-maxent.bin"/>
    <filter class="solr.TypeAsPayloadFilterFactory"/>
  </analyzer>
</fieldType>
My assumption
I assume that when I send data through the multiple-extract update processor chain, named entities will be extracted from <str name="source">description_en</str>
and put into
<str name="dest">content</str>
. But I don't see this behavior at all.
My tests
request for data insert
POST http://localhost:8983/solr/numberplate/update?version=2.2&wt=xml&update.chain=multiple-extract
<!-- Test document: description_en carries the text to analyze; content is sent empty. -->
<add>
  <doc>
    <field name="id">0e5c7067-9cf0-445b-8374-4bf25484420c</field>
    <field name="description_en">This is Steve Jobs 4 </field>
    <field name="content_pos">This is text 4</field>
    <field name="content"></field>
  </doc>
</add>
response for data insert
<?xml version="1.0" encoding="UTF-8"?>
<response>
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">20</int>
</lst>
</response>
request for data select
http://localhost:8983/solr/numberplate/select?q=*:*
response for data select
{
"responseHeader": {
"zkConnected": true,
"status": 0,
"QTime": 22,
"params": {
"q": "*:*"
}
},
"response": {
"numFound": 5,
"start": 0,
"maxScore": 1,
"docs": [
{
"id": "0e5c7067-9cf0-445b-8374-4bf25484420c",
"description_en": "This is Steve Jobs 4 ",
"content_pos": [
"This is text 4"
],
"content": [
""
],
"_version_": 1598004417210089500
}
]
}
}
Question
What have I done wrong? Does NER in Solr 7.3 work in some other way, or am I misunderstanding something?