3

I've followed the examples listed in the documentation here: http://wiki.apache.org/solr/Deduplication and https://cwiki.apache.org/confluence/display/solr/De-Duplication

However, when I inspect the indexed documents, the signatureField of every document comes back with the same value: 0000000000000000

I can't seem to figure out why a unique signature isn't being generated.

Relevant config sections:

solrconfig.xml

 <requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
   <!-- Make "dedupe" the default update.chain for requests sent to
        this handler.  The updateRequestProcessorChain with that name
        is defined further down in this file. -->
   <lst name="defaults">
     <str name="update.chain">dedupe</str>
   </lst>
 </requestHandler>

...

<!-- Deduplication

   Update processor chain named "dedupe".  Its first processor is
   configured to build a signature from the "name", "features" and
   "cat" fields using Lookup3Signature and to store the result in
   the field called "signatureField".  overwriteDupes is set to
   false.

   NOTE(review): the stock example comment this was copied from
   described using the "id" field as the signatureField; this
   configuration writes to a separate "signatureField" field
   instead, so that reasoning for overwriteDupes=false does not
   carry over as written.  Confirm the intended setting.

   NOTE(review): a signature consisting only of zeros presumably
   means that no field content was fed into the hash, i.e. none of
   the fields listed under "fields" were present on the incoming
   documents.  Verify that the indexed documents actually contain
   name, features or cat.

-->

 <updateRequestProcessorChain name="dedupe">
   <processor class="solr.processor.SignatureUpdateProcessorFactory">
     <bool name="enabled">true</bool>
     <str name="signatureField">signatureField</str>
     <bool name="overwriteDupes">false</bool>
     <str name="fields">name,features,cat</str>
     <str name="signatureClass">solr.processor.Lookup3Signature</str>
   </processor>
   <processor class="solr.LogUpdateProcessorFactory" />
   <processor class="solr.RunUpdateProcessorFactory" />
 </updateRequestProcessorChain>

schema.xml

     <fields>
   <!-- Valid attributes for fields:
     name: mandatory - the name for the field
     type: mandatory - the name of a previously defined type from the
       <types> section
     indexed: true if this field should be indexed (searchable or sortable)
     stored: true if this field should be retrievable
     multiValued: true if this field may contain multiple values per document
     omitNorms: (expert) set to true to omit the norms associated with
       this field (this disables length normalization and index-time
       boosting for the field, and saves some memory).  Only full-text
       fields or fields that need an index-time boost need norms.
       Norms are omitted for primitive (non-analyzed) types by default.
     termVectors: [false] set to true to store the term vector for a
       given field.
       When using MoreLikeThis, fields used for similarity should be
       stored for best performance.
     termPositions: Store position information with the term vector.
       This will increase storage costs.
     termOffsets: Store offset information with the term vector. This
       will increase storage costs.
     default: a value that should be used if no value is specified
       when adding a document.
   -->

   <field name="signatureField" type="string" stored="true" indexed="true" multiValued="false" />
   <field name="id" type="string" indexed="true" stored="true" required="true" />
   <field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/>
   <field name="name" type="text_general" indexed="true" stored="true"/>
   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
   <field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/>
   <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="features" type="text_general" indexed="true" stored="true" multiValued="true"/>
... etc

Can anyone steer me in the right direction?

Ryan Price
  • 315
  • 6
  • 17

0 Answers0