3

I'm having trouble using Solr's DataImportHandler to index an XML file that is in Solr's add-XML schema format, even though I am able to index the same file if I send it via an HTTP update request.

The problem is with the nested entities (child documents) in the document: I am not able to index them correctly.

Here is an example of the XML file to be indexed:

<add commitWithin="5000">
  <doc>
    <field name="id">1</field>
    <field name="type">Document</field>
    <doc>
      <field name="id">1_1</field>
      <field name="nested_status">Nested</field>
    </doc>
    <field name="isParent">true</field>
  </doc>
</add>

And my data-config.xml:

<dataConfig>
  <dataSource name="Test_XML"
              type="FileDataSource"
              encoding="ISO_8859_1"/>
  <document>
    <entity name="doc"
                processor="XPathEntityProcessor"
                stream="true"
                useSolrAddSchema="true"
                url="LOCATION\useSolrAddSchema_test.xml">
      <entity name="nested_doc"
              processor="XPathEntityProcessor"
              stream="true"
              useSolrAddSchema="true"
              child="true"
              url="LOCATION\useSolrAddSchema_test.xml">
      </entity>
    </entity>
  </document>
</dataConfig>

And the Debug Response is:

{
  "responseHeader": {
    "status": 0,
    "QTime": 162
  },
  "initArgs": [
    "defaults",
    [
      "config",
      "data-config.xml"
    ]
  ],
  "command": "full-import",
  "mode": "debug",
  "documents": [
    {
      "isParent": [
        "true"
      ],
      "id": [
        "1"
      ],
      "type": [
        "Document"
      ],
      "_version_": [
        1549343328462438400
      ],
      "_root_": [
        "1"
      ]
    }
  ],
  "verbose-output": [],
  "status": "idle",
  "importResponse": "",
  "statusMessages": {
    "Total Requests made to DataSource": "0",
    "Total Rows Fetched": "2",
    "Total Documents Processed": "1",
    "Total Documents Skipped": "0",
    "Full Dump Started": "2016-10-27 11:48:59",
    "": "Indexing completed. Added/Updated: 1 documents. Deleted 0 documents.",
    "Committed": "2016-10-27 11:48:59",
    "Time taken": "0:0:0.149"
  }
}

So it is ignoring the nested document, and when I query to get all indexed documents I get two copies of the outside document:

{
  "responseHeader":{
    "status":0,
    "QTime":0,
    "params":{
      "q":"*:*",
      "indent":"on",
      "wt":"json",
      "_":"1477568715268"}},
  "response":{"numFound":2,"start":0,"docs":[
      {
        "id":"1",
        "type":"Document"},
      {
        "id":"1",
        "type":"Document",
        "_version_":1549343328462438400}]
  }}

I have looked into this question, where the accepted answer says it is not possible to have nested entities, but since Solr 5.1 it should be possible with the child="true" attribute.

I'm currently working with Solr version 6.2.1, but would prefer a solution that is also compatible with older versions.

Community
  • 1
  • 1
João Almeida
  • 4,487
  • 2
  • 19
  • 35

0 Answers