0

I am using Apache Solr 3.6.0. I have indexed a file with this command:

 curl "http://localhost:8983/solr/update/extract?stream.file=/home/Desktop/DOCUMENTS/x.pdf&stream.contentType=application/pdf&literal.id=DOC_N&commit=true"

When I search for some text, for example:

"http://localhost:8983/solr/select/?q=Getting+Started&version=2.2&start=0&rows=10&indent=on"

the result returned is:

<response>
    <lst name="responseHeader">
        <int name="status">0</int>
        <int name="QTime">12</int>
        <lst name="params">
            <str name="indent">on</str>
            <str name="start">0</str>
            <str name="q">Getting Started</str>
            <str name="version">2.2</str>
            <str name="rows">10</str>
        </lst>
    </lst>
    <result name="response" numFound="3" start="0">
        <doc>
            <arr name="content_type">
                <str>application/pdf</str>
            </arr>
            <str name="id">doc2</str>
        </doc>
        <doc>
            <arr name="content_type">
                <str>application/pdf</str>
            </arr>
            <str name="id">1</str>
        </doc>
        <doc>
            <arr name="content_type">
                <str>application/pdf</str>
            </arr>
            <str name="id">DOC_N</str>
        </doc>
    </result>
</response>

What I understand from this is that the search terms were found in the indexed PDFs. Now I want the search to also return some of the text that appears before and after the matched words in the indexed PDF. Please help.

Here is my solrconfig.xml:

<?xml version="1.0" encoding="UTF-8" ?>
<config>
    <!-- Core-wide switches: configuration-error policy (overridable via the
         solr.abortOnConfigurationError property) and the Lucene compatibility level. -->
    <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>
    <luceneMatchVersion>LUCENE_36</luceneMatchVersion>

    <!-- Contrib jars, each selected by directory plus file-name regex. -->
    <!-- Solr Cell (extraction) -->
    <lib dir="../../dist/" regex="apache-solr-cell-\d.*\.jar"/>
    <lib dir="../../contrib/extraction/lib" regex=".*\.jar"/>
    <!-- Clustering -->
    <lib dir="../../dist/" regex="apache-solr-clustering-\d.*\.jar"/>
    <lib dir="../../contrib/clustering/lib/" regex=".*\.jar"/>
    <!-- DataImportHandler -->
    <lib dir="../../dist/" regex="apache-solr-dataimporthandler-\d.*\.jar"/>
    <lib dir="../../contrib/dataimporthandler/lib/" regex=".*\.jar"/>
    <!-- Language identification -->
    <lib dir="../../dist/" regex="apache-solr-langid-\d.*\.jar"/>
    <lib dir="../../contrib/langid/lib/" regex=".*\.jar"/>
    <!-- Velocity response writer -->
    <lib dir="../../dist/" regex="apache-solr-velocity-\d.*\.jar"/>
    <lib dir="../../contrib/velocity/lib" regex=".*\.jar"/>

    <!-- Index location and directory implementation; both take their value from a
         system property, with the default written after the colon. -->
    <dataDir>${solr.data.dir:}</dataDir>
    <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>

    <!-- No index settings are overridden here. -->
    <indexConfig/>
    <jmx/>
    <!-- Searcher-side settings: clause limit, caches and warming listeners. -->
    <query>
        <maxBooleanClauses>1024</maxBooleanClauses>

        <!-- Three caches, each sized at 512 entries with autowarming disabled. -->
        <filterCache class="solr.FastLRUCache" size="512" initialSize="512" autowarmCount="0"/>
        <queryResultCache class="solr.LRUCache" size="512" initialSize="512" autowarmCount="0"/>
        <documentCache class="solr.LRUCache" size="512" initialSize="512" autowarmCount="0"/>

        <enableLazyFieldLoading>true</enableLazyFieldLoading>
        <queryResultWindowSize>20</queryResultWindowSize>
        <queryResultMaxDocsCached>200</queryResultMaxDocsCached>

        <!-- newSearcher: no warming queries are configured. -->
        <listener event="newSearcher" class="solr.QuerySenderListener">
            <arr name="queries"/>
        </listener>
        <!-- firstSearcher: a single static warming query. -->
        <listener event="firstSearcher" class="solr.QuerySenderListener">
            <arr name="queries">
                <lst>
                    <str name="q">static firstSearcher warming in solrconfig.xml</str>
                </lst>
            </arr>
        </listener>

        <useColdSearcher>false</useColdSearcher>
        <maxWarmingSearchers>2</maxWarmingSearchers>
    </query>
    <!-- HTTP front end: remote streaming is switched on (the indexing command shown
         with this config passes stream.file), multipart uploads are capped at
         2048000 KB, and never304 is set to true. -->
    <requestDispatcher>
        <requestParsers multipartUploadLimitInKB="2048000"
                        enableRemoteStreaming="true"/>
        <httpCaching never304="true"/>
    </requestDispatcher>
    <!-- General-purpose search handler. Defaults: echo only explicit parameters,
         return 10 rows, search the "text" field when the query names none.
         No highlighting parameters (hl, hl.fl) are defaulted here, so a client
         must send them with the request to get snippets. -->
    <requestHandler class="solr.SearchHandler" name="/select">
        <lst name="defaults">
            <str name="echoParams">explicit</str>
            <int name="rows">10</int>
            <str name="df">text</str>
        </lst>
    </requestHandler>
    <!-- Velocity-rendered browse handler ("Solritas"): edismax search with
         MoreLikeThis and faceting defaults.
         The closing </lst> and </requestHandler> tags at the end of this block are
         required: without them every element that follows becomes a child of this
         handler's defaults list and the document is not well-formed. -->
    <requestHandler name="/browse" class="solr.SearchHandler">
        <lst name="defaults">
            <str name="echoParams">explicit</str>
            <!-- VelocityResponseWriter settings -->
            <str name="wt">velocity</str>
            <str name="v.template">browse</str>
            <str name="v.layout">layout</str>
            <str name="title">Solritas</str>
            <!-- Query parsing -->
            <str name="df">text</str>
            <str name="defType">edismax</str>
            <str name="q.alt">*:*</str>
            <str name="rows">10</str>
            <str name="fl">*,score</str>
            <!-- MoreLikeThis -->
            <str name="mlt.qf">
                text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
            </str>
            <str name="mlt.fl">text,features,name,sku,id,manu,cat</str>
            <int name="mlt.count">3</int>
            <str name="qf">
                text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
            </str>
            <!-- Faceting: fields, queries, pivot and ranges -->
            <str name="facet">on</str>
            <str name="facet.field">cat</str>
            <str name="facet.field">manu_exact</str>
            <str name="facet.query">ipod</str>
            <str name="facet.query">GB</str>
            <str name="facet.mincount">1</str>
            <str name="facet.pivot">cat,inStock</str>
            <str name="facet.range.other">after</str>
            <str name="facet.range">price</str>
            <int name="f.price.facet.range.start">0</int>
            <int name="f.price.facet.range.end">600</int>
            <int name="f.price.facet.range.gap">50</int>
            <str name="facet.range">popularity</str>
            <int name="f.popularity.facet.range.start">0</int>
            <int name="f.popularity.facet.range.end">10</int>
            <int name="f.popularity.facet.range.gap">3</int>
            <str name="facet.range">manufacturedate_dt</str>
            <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str>
            <str name="f.manufacturedate_dt.facet.range.end">NOW</str>
            <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str>
            <str name="f.manufacturedate_dt.facet.range.other">before</str>
            <str name="f.manufacturedate_dt.facet.range.other">after</str>
        </lst>
    </requestHandler>
            <!-- Format-specific update endpoints: javabin, CSV and JSON.
                 The CSV and JSON handlers are marked startup="lazy". -->
            <requestHandler name="/update/javabin" class="solr.BinaryUpdateRequestHandler"/>
            <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy"/>
            <requestHandler name="/update/json" class="solr.JsonUpdateRequestHandler" startup="lazy"/>
            <!-- Rich-document extraction endpoint; this is the handler the PDF indexing
                 command posts to. -->
            <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler" startup="lazy">
                <lst name="defaults">
                    <!-- The extracted body is mapped to the "text" field. To return that
                         text or to highlight it, the target must be a stored field. -->
                    <str name="fmap.content">text</str>
                    <str name="lowernames">true</str>
                    <str name="uprefix">ignored_</str>
                    <!-- Keep link hrefs ("a" is mapped to "links"); div attributes are
                         mapped to the ignored_ prefix. -->
                    <str name="captureAttr">true</str>
                    <str name="fmap.a">links</str>
                    <str name="fmap.div">ignored_</str>
                </lst>
            </requestHandler>
            <!-- XSLT update endpoint, the field/document analysis endpoints and the
                 admin handler bundle. All but /admin/ are marked startup="lazy". -->
            <requestHandler name="/update/xslt" class="solr.XsltUpdateRequestHandler" startup="lazy"/>
            <requestHandler name="/analysis/field" class="solr.FieldAnalysisRequestHandler" startup="lazy"/>
            <requestHandler name="/analysis/document" class="solr.DocumentAnalysisRequestHandler" startup="lazy"/>
            <requestHandler name="/admin/" class="solr.admin.AdminHandlers"/>
            <!-- Ping endpoint: the query is pinned as an invariant to "solrpingquery";
                 all parameters are echoed by default. -->
            <requestHandler class="solr.PingRequestHandler" name="/admin/ping">
                <lst name="invariants">
                    <str name="q">solrpingquery</str>
                </lst>
                <lst name="defaults">
                    <str name="echoParams">all</str>
                </lst>
            </requestHandler>
            <!-- Debug endpoint that returns the request contents to the caller. -->
            <requestHandler class="solr.DumpRequestHandler" name="/debug/dump">
                <lst name="defaults">
                    <str name="echoParams">explicit</str>
                    <str name="echoHandler">true</str>
                </lst>
            </requestHandler>
            <!-- Spell checking: one "default" dictionary built from the "name" field and
                 kept in the "spellchecker" index directory. -->
            <searchComponent class="solr.SpellCheckComponent" name="spellcheck">
                <str name="queryAnalyzerFieldType">textSpell</str>
                <lst name="spellchecker">
                    <str name="name">default</str>
                    <str name="field">name</str>
                    <str name="spellcheckIndexDir">spellchecker</str>
                    <!-- Original note: a setting could be uncommented here to require
                         terms to occur in 1% of the documents before they enter the
                         dictionary. No such setting is present in this file. -->
                </lst>
            </searchComponent>
            <!-- Search handler that runs the spellcheck component after the standard ones. -->
            <requestHandler class="solr.SearchHandler" name="/spell" startup="lazy">
                <lst name="defaults">
                    <str name="df">text</str>
                    <str name="spellcheck.onlyMorePopular">false</str>
                    <str name="spellcheck.extendedResults">false</str>
                    <str name="spellcheck.count">1</str>
                </lst>
                <arr name="last-components">
                    <str>spellcheck</str>
                </arr>
            </requestHandler>
            <!-- Term vector component, plus a handler that turns it on by default (tv=true). -->
            <searchComponent class="solr.TermVectorComponent" name="tvComponent"/>
            <requestHandler class="solr.SearchHandler" name="/tvrh" startup="lazy">
                <lst name="defaults">
                    <str name="df">text</str>
                    <bool name="tv">true</bool>
                </lst>
                <arr name="last-components">
                    <str>tvComponent</str>
                </arr>
            </requestHandler>
            <!-- Clustering component. It is enabled only when the solr.clustering.enabled
                 property is set to true (the default written here is false). -->
            <searchComponent name="clustering" class="solr.clustering.ClusteringComponent" enable="${solr.clustering.enabled:false}">
                <!-- First engine. Only one engine may carry the name "default". -->
                <lst name="engine">
                    <str name="name">default</str>
                    <!-- Carrot2 algorithm class. Algorithms listed as available:
                           org.carrot2.clustering.lingo.LingoClusteringAlgorithm
                           org.carrot2.clustering.stc.STCClusteringAlgorithm
                           org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
                         Their characteristics are described at
                         http://project.carrot2.org/algorithms.html
                    -->
                    <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
                    <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
                    <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
                    <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
                </lst>
                <!-- Second engine, using the STC algorithm. -->
                <lst name="engine">
                    <str name="name">stc</str>
                    <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
                </lst>
            </searchComponent>
            <!-- Example-only handler that demonstrates the clustering component; in a real
                 deployment the component would more likely be added to the request
                 handlers that are already defined. -->
            <requestHandler name="/clustering" class="solr.SearchHandler" startup="lazy" enable="${solr.clustering.enabled:false}">
                <lst name="defaults">
                    <bool name="clustering">true</bool>
                    <str name="clustering.engine">default</str>
                    <bool name="clustering.results">true</bool>
                    <!-- Field used as the title -->
                    <str name="carrot.title">name</str>
                    <str name="carrot.url">id</str>
                    <!-- Field whose content is clustered -->
                    <str name="carrot.snippet">features</str>
                    <!-- Summaries on, sub-clusters off -->
                    <bool name="carrot.produceSummary">true</bool>
                    <bool name="carrot.outputSubClusters">false</bool>
                    <str name="df">text</str>
                    <str name="defType">edismax</str>
                    <str name="qf">
                        text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
                    </str>
                    <str name="q.alt">*:*</str>
                    <str name="rows">10</str>
                    <str name="fl">*,score</str>
                </lst>
                <arr name="last-components">
                    <str>clustering</str>
                </arr>
            </requestHandler>
            <!-- Terms component and a handler whose component list is replaced by it:
                 "components" is used here, not "last-components". -->
            <searchComponent class="solr.TermsComponent" name="terms"/>
            <requestHandler class="solr.SearchHandler" name="/terms" startup="lazy">
                <lst name="defaults">
                    <bool name="terms">true</bool>
                </lst>
                <arr name="components">
                    <str>terms</str>
                </arr>
            </requestHandler>
            <!-- Query elevation component, configured from elevate.xml. -->
            <searchComponent class="solr.QueryElevationComponent" name="elevator">
                <!-- Field type used to analyze incoming queries -->
                <str name="queryFieldType">string</str>
                <str name="config-file">elevate.xml</str>
            </searchComponent>
            <!-- Demonstration handler for the elevator component -->
            <requestHandler class="solr.SearchHandler" name="/elevate" startup="lazy">
                <lst name="defaults">
                    <str name="echoParams">explicit</str>
                    <str name="df">text</str>
                </lst>
                <arr name="last-components">
                    <str>elevator</str>
                </arr>
            </requestHandler>
            <!-- Highlighting component: fragmenters, formatter, encoder, frag list builders,
                 fragments builders and boundary scanners. Entries carrying default="true"
                 are the ones marked as defaults of their kind. -->
            <searchComponent name="highlight" class="solr.HighlightComponent">
                <highlighting>
                    <!-- Standard (gap) fragmenter, fragment size 100. The original note says
                         this entry could most likely be omitted in the default case. -->
                    <fragmenter name="gap" class="solr.highlight.GapFragmenter" default="true">
                        <lst name="defaults">
                            <int name="hl.fragsize">100</int>
                        </lst>
                    </fragmenter>
                    <!-- Regex fragmenter, intended for sentence extraction -->
                    <fragmenter name="regex" class="solr.highlight.RegexFragmenter">
                        <lst name="defaults">
                            <!-- fragment size kept a little smaller because of the slop -->
                            <int name="hl.fragsize">70</int>
                            <!-- fragment sizes may vary by 50% -->
                            <float name="hl.regex.slop">0.5</float>
                            <!-- simple sentence pattern -->
                            <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
                        </lst>
                    </fragmenter>
                    <!-- Standard formatter: wraps matches in <em> ... </em> -->
                    <formatter name="html" class="solr.highlight.HtmlFormatter" default="true">
                        <lst name="defaults">
                            <str name="hl.simple.pre"><![CDATA[<em>]]></str>
                            <str name="hl.simple.post"><![CDATA[</em>]]></str>
                        </lst>
                    </formatter>
                    <encoder name="html" class="solr.highlight.HtmlEncoder"/>
                    <fragListBuilder name="simple" class="solr.highlight.SimpleFragListBuilder" default="true"/>
                    <!-- Single fragListBuilder -->
                    <fragListBuilder name="single" class="solr.highlight.SingleFragListBuilder"/>
                    <!-- Default-tag fragments builder -->
                    <fragmentsBuilder name="default" class="solr.highlight.ScoreOrderFragmentsBuilder" default="true">
                        <!-- 
                        <lst name="defaults">
                          <str name="hl.multiValuedSeparatorChar">/</str>
                        </lst>
                        -->
                    </fragmentsBuilder>
                    <!-- Fragments builder with a list of differently coloured tags -->
                    <fragmentsBuilder name="colored" class="solr.highlight.ScoreOrderFragmentsBuilder">
                        <lst name="defaults">
                            <str name="hl.tag.pre"><![CDATA[
                   <b style="background:yellow">,<b style="background:lawgreen">,
                   <b style="background:aquamarine">,<b style="background:magenta">,
                   <b style="background:palegreen">,<b style="background:coral">,
                   <b style="background:wheat">,<b style="background:khaki">,
                   <b style="background:lime">,<b style="background:deepskyblue">]]></str>
                            <str name="hl.tag.post"><![CDATA[</b>]]></str>
                        </lst>
                    </fragmentsBuilder>
                    <!-- Character-based boundary scanner; tab, LF and CR are given as XML
                         character references. -->
                    <boundaryScanner name="default" class="solr.highlight.SimpleBoundaryScanner" default="true">
                        <lst name="defaults">
                            <str name="hl.bs.maxScan">10</str>
                            <str name="hl.bs.chars">.,!? &#9;&#10;&#13;</str>
                        </lst>
                    </boundaryScanner>
                    <!-- BreakIterator-based boundary scanner: WORD type, en/US -->
                    <boundaryScanner name="breakIterator" class="solr.highlight.BreakIteratorBoundaryScanner">
                        <lst name="defaults">
                            <str name="hl.bs.type">WORD</str>
                            <str name="hl.bs.language">en</str>
                            <str name="hl.bs.country">US</str>
                        </lst>
                    </boundaryScanner>
                </highlighting>
            </searchComponent>
            <!-- Response writers. JSON output is sent with a text/plain content type;
                 the Velocity writer is marked startup="lazy". -->
            <queryResponseWriter class="solr.JSONResponseWriter" name="json">
                <str name="content-type">text/plain; charset=UTF-8</str>
            </queryResponseWriter>
            <queryResponseWriter class="solr.VelocityResponseWriter" name="velocity" startup="lazy"/>
            <!-- The XSLT writer transforms the XML output with any stylesheet found in
                 Solr's conf/xslt directory; stylesheet changes are checked for every
                 xsltCacheLifetimeSeconds.
            -->
            <queryResponseWriter class="solr.XSLTResponseWriter" name="xslt">
                <int name="xsltCacheLifetimeSeconds">5</int>
            </queryResponseWriter>
            <!-- Query Parsers
           http://wiki.apache.org/solr/SolrQuerySyntax
               Multiple QParserPlugins can be registered by name, and then
               used in either the "defType" param for the QueryComponent (used
               by SearchHandler) or in LocalParams
            -->
            <!-- example of registering a query parser -->
            <!--
             <queryParser name="myparser" class="com.mycompany.MyQParserPlugin"/>
            -->
            <!-- Admin settings: default query is *:* -->
            <admin>
                <defaultQuery>*:*</defaultQuery>
                <!-- A healthcheck file can be configured for servers that sit behind a
                     load balancer; it is left disabled here:
                -->
                <!--
                 <healthcheck type="file">server-enabled</healthcheck>
                -->
            </admin>
            <!-- Suggester built on SpellCheckComponent: TST lookup over the
                 "text_suggest_full" field. -->
            <searchComponent class="solr.SpellCheckComponent" name="suggest_full">
                <str name="queryAnalyzerFieldType">suggestTextFull</str>
                <lst name="spellchecker">
                    <str name="name">suggest_full</str>
                    <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
                    <str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
                    <str name="field">text_suggest_full</str>
                    <str name="fieldType">suggestTextFull</str>
                </lst>
            </searchComponent>
            <!-- Handler exposing the suggester: spellcheck is on by default and returns up
                 to 10 entries from the suggest_full dictionary. -->
            <requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_full">
                <lst name="defaults">
                    <str name="echoParams">explicit</str>
                    <str name="spellcheck">true</str>
                    <str name="spellcheck.dictionary">suggest_full</str>
                    <str name="spellcheck.count">10</str>
                    <str name="spellcheck.onlyMorePopular">true</str>
                </lst>
                <arr name="last-components">
                    <str>suggest_full</str>
                </arr>
            </requestHandler>
            <!-- Handler flagged default="true": edismax over kw_stopped / kw_phrases, with
                 only the "keywords" field in the returned field list.
                 NOTE(review): the sample query in this post goes to /select, which has its
                 own definition, so these defaults presumably do not apply to it. Confirm. -->
            <requestHandler class="solr.SearchHandler" name="edismax" default="true">
                <lst name="defaults">
                    <str name="defType">edismax</str>
                    <str name="echoParams">explicit</str>
                    <float name="tie">0.1</float>
                    <str name="fl">keywords</str>
                    <str name="mm">1</str>
                    <!-- query fields, phrase fields and their slops -->
                    <str name="qf">kw_stopped^1.0 kw_phrases^5.0</str>
                    <str name="pf">kw_phrases^50.0</str>
                    <int name="ps">3</int>
                    <int name="qs">3</int>
                    <str name="q.alt">*:*</str>
                </lst>
            </requestHandler>
            <!-- Stand-alone highlighting section, outside any searchComponent.
                 NOTE(review): it repeats the gap/regex fragmenters and html formatter that
                 the "highlight" searchComponent in this file also declares. Confirm which
                 of the two Solr actually reads before relying on either. -->
            <highlighting>
                <!-- Standard (gap) fragmenter; the original note says it could most likely
                     be omitted in the default case -->
                <fragmenter name="gap"
                            class="org.apache.solr.highlight.GapFragmenter"
                            default="true">
                    <lst name="defaults">
                        <int name="hl.fragsize">100</int>
                    </lst>
                </fragmenter>
                <!-- Regex fragmenter (for instance, for sentence extraction) -->
                <fragmenter name="regex"
                            class="org.apache.solr.highlight.RegexFragmenter">
                    <lst name="defaults">
                        <!-- fragment size kept a little smaller because of the slop -->
                        <int name="hl.fragsize">70</int>
                        <!-- fragment sizes may vary by 50% -->
                        <float name="hl.regex.slop">0.5</float>
                        <!-- simple sentence pattern -->
                        <str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str>
                    </lst>
                </fragmenter>
                <!-- Standard formatter: wraps matches in <em> ... </em> -->
                <formatter name="html"
                           class="org.apache.solr.highlight.HtmlFormatter"
                           default="true">
                    <lst name="defaults">
                        <str name="hl.simple.pre"><![CDATA[<em>]]></str>
                        <str name="hl.simple.post"><![CDATA[</em>]]></str>
                    </lst>
                </formatter>
            </highlighting>
            <!-- multi-colored tag FragmentsBuilder -->
            <!-- NOTE(review): these two elements sit outside any <highlighting> section;
                 confirm that Solr picks them up in this position. -->
            <fragmentsBuilder name="colored"
                              class="org.apache.solr.highlight.ScoreOrderFragmentsBuilder">
                <lst name="defaults">
                    <str name="hl.tag.pre"><![CDATA[
             <b style="background:yellow">,<b style="background:lawgreen">,
             <b style="background:aquamarine">,<b style="background:magenta">,
             <b style="background:palegreen">,<b style="background:coral">,
             <b style="background:wheat">,<b style="background:khaki">,
             <b style="background:lime">,<b style="background:deepskyblue">]]></str>
                    <str name="hl.tag.post"><![CDATA[</b>]]></str>
                </lst>
            </fragmentsBuilder>
            <!-- BreakIterator-based boundary scanner: WORD type, en/US -->
            <boundaryScanner name="breakIterator"
                             class="solr.highlight.BreakIteratorBoundaryScanner">
                <lst name="defaults">
                    <str name="hl.bs.type">WORD</str>
                    <str name="hl.bs.language">en</str>
                    <str name="hl.bs.country">US</str>
                </lst>
            </boundaryScanner>
            <!-- Character-based boundary scanner, marked as the default scanner.
                 Tab and newline are written as the character references &#9; and &#10;
                 because XML has no backslash escapes: a value written as "\t\n" is the
                 four literal characters backslash, t, backslash, n, which would make the
                 letters "t" and "n" boundary characters. -->
            <boundaryScanner name="simple" class="solr.highlight.SimpleBoundaryScanner" default="true">
                <lst name="defaults">
                    <str name="hl.bs.maxScan">10</str>
                    <str name="hl.bs.chars">.,!?&#9;&#10;</str>
                </lst>
            </boundaryScanner>
</config>
Nicomedes E.
  • 1,326
  • 5
  • 18
  • 27
Asif S. Abid
  • 57
  • 1
  • 14

1 Answer

0

Solr highlighting provides the option to highlight the searched term and returns text before and after the match.

You can check for the Solr highlight feature.
Check out the Highlighting Parameters to configure it.

Jayendra
  • 52,349
  • 4
  • 80
  • 90
  • Hope you have done `hl=true` and `hl.fl=text` (or whatever field). And does `q=*:*` reveal the content of the `text` field? Otherwise your command has not worked. – Jesvin Jose Jun 15 '12 at 11:54
  • What request handler are you using and on what field are you searching upon. As @aitchnyu mentioned, please pass the highlighting parameters in request and check the results. – Jayendra Jun 15 '12 at 12:03