0

I am facing a problem while setting up the Lucene search engine in umbraco. I am trying to make a search on the data stored in default index created by Umbraco. The method for searching is below:

        private DictionaryResult GetRowContent(
        Lucene.Net.Highlight.Highlighter highlighter,
        Lucene.Net.Analysis.Standard.StandardAnalyzer analyzer
        ,Lucene.Net.Documents.Document doc1, string criteria)
    {
        JavaScriptSerializer jsScriptSerializer = new JavaScriptSerializer();
        DictionaryResult controls = new DictionaryResult();
        Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new StringReader(doc1.Get("bodyContent")));
        dynamic rowContentHtmlDocument = JObject.Parse(((JValue)doc1.Get("bodyContent")).ToString(CultureInfo.CurrentCulture));
        foreach (dynamic section in rowContentHtmlDocument.sections)
        {
            foreach (var row in section.rows)
            {
                foreach (var area in row.areas)
                {
                    foreach (var control in area.controls)
                    {
                        if (control != null && control.editor != null) // && control.editor.view != null)
                        {
                            JObject rowContentHtml = null;
                            try
                            {
                                rowContentHtml = JObject.Parse(((JContainer)control)["value"].ToString());
                            }
                            catch (Exception e)
                            {
                            }
                            if (rowContentHtml != null)
                            {
                                try
                                {
                                    var macroParamsDictionary = JObject.Parse(((JContainer)rowContentHtml)["macroParamsDictionary"].ToString());
                                    var documentText = macroParamsDictionary.GetValue("dokument");
                                    if (documentText != null)
                                    {
                                        var document = documentText.ToString().Replace(""", "\"");
                                        dynamic documents = jsScriptSerializer.Deserialize<dynamic>(document);
                                        foreach (Dictionary<string, object> doc in documents)
                                        {
                                            if (doc.ContainsKey("FileName") && doc.ContainsKey("DocumentId"))
                                            {
                                                if (doc["FileName"].ToString().Length > 0 && 
                                                    doc["FileName"].ToString().ToLower().Contains(criteria.ToLower()))
                                                {
                                                    controls.Add(new RowResult()
                                                    {
                                                        Type = 0,
                                                        Object = new Document()
                                                        {
                                                            DocumentName = doc["FileName"].ToString(),//highlighter.GetBestFragments(stream, doc["FileName"].ToString(), 1, "..."),
                                                            DocId = Guid.Parse(doc["DocumentId"].ToString())
                                                        } // StringBuilder(@"<a href=" + Url.Action("DownloadDocument", "Document", new { DocumentId = doc["DocumentId"] }) + "> " + @doc["FileName"] + "</a>").ToString()
                                                    }
                                                    );
                                                }
                                            }
                                        }
                                    }
                                }
                                catch (Exception e)
                                {
                                }
                            }
                            else
                            {
                                var text = HtmlRemoval.StripTagsRegex(((JContainer)control)["value"].ToString()).Replace("ë", "e").Replace("ç", "c");
                                var textResultFiltered =  highlighter.GetBestFragments(stream,doc1.Get("bodyContent"), 5, "...");
                                controls.Add(new RowResult()
                                {
                                    Type = 1,
                                    Object = textResultFiltered
                                });
                            }
                        }
                    }
                }
            }
        }
        return controls;
    }

Here I am trying to filter macro documents from simple html content and render differently. But at the end to this part

var text = HtmlRemoval.StripTagsRegex(((JContainer)control)["value"].ToString()).Replace("ë", "e").Replace("ç", "c");
                            var textResultFiltered =  highlighter.GetBestFragments(stream,doc1.Get("bodyContent"), 5, "...");
                            controls.Add(new RowResult()
                            {
                                Type = 1,
                                Object = textResultFiltered
                            });

it is including the macro in searching. As the result I get the documents property but the html content hightlighted has the macro content like below:

6th Edition V413HAV.pdf","FileContent"... Framework 6th Edition V413HAV.pdf","... with Java 8 - 1st Edition (2015) - Copy.pdf"... 4.5 Framework 6th Edition V413HAV.pdf","... And The NET 4.5 Framework 6th Edition V413HAV.pdf" which is coming from Json data of the macro. Any idea how to exclude the macros from searching or to customize the hmtl content not to search on specific macro ? Thanks in advance. 

I am refering to this link to create the Hightlighter etc... Link to Lucene example

Any idea how to prevent searching on macros or exclude them from the highlighted content ?

Community
  • 1
  • 1
Rey
  • 3,663
  • 3
  • 32
  • 55

3 Answers3

0

That looks WAY too complicated to be right, if you're just making a regular search. Do you know that Umbraco has its own Lucene "version" called Examine? It's built into Umbraco and doesn't require much if any setting up to get a standard search running: https://our.umbraco.org/documentation/reference/searching/examine/

I've never seen macro or JSON markup in my search results using Examine, so maybe try it out?

Jannik Anker
  • 3,320
  • 1
  • 13
  • 21
0

You can use easily Examine. You only need to select the search provider you want (config/ExamineSettings.config), which allow you to select if you want to avoid unpublished and protected content. Then you only need to do something like the next piece of code where you can choice the fields you want to search or the Dact Types you don't want to avoind for example.

string term = "test"

var criteria = ExamineManager.Instance.SearchProviderCollection["ExternalSearcher"].CreateSearchCriteria();
var crawl = criteria.GroupedOr(new string[] { "nodeName", "pageTitle", "metaDescription", "metaKeywords" }, term)
                .Not().Field("nodeTypeAlias", "GlobalSettings")
                .Not().Field("nodeTypeAlias", "Error")
                .Not().Field("nodeTypeAlias", "File")
                .Not().Field("nodeTypeAlias", "Folder")
                .Not().Field("nodeTypeAlias", "Image")
                .Not().Field("excludeFromSearch", "1")
                .Compile();

 ISearchResults SearchResults = ExamineManager.Instance
                .SearchProviderCollection["ExternalSearcher"]
                .Search(crawl);

 IList<JsonSearchResult> results = new List<JsonSearchResult>();

Hope this make sense.

Lucio
  • 1
  • 2
  • Hi Lucio, thanks for your response. I am wondering how to make the highlighting process. If you could give me a real example ?! – Rey Sep 13 '16 at 12:23
0

I tried using Examine as well as below:

SearchQuery = string.Format("+{0}:{1}~", SearchField, criteria);
var Criteria = ExamineManager.Instance
                    .SearchProviderCollection["ExternalSearcher"]
                    .CreateSearchCriteria();
var crawl = Criteria.GroupedOr(new string[] { "bodyContent", "nodeName" }, criteria)
                    .Not()
                    .Field("umbracoNaviHide", "1")
                    .Not()
                    .Field("nodeTypeAlias", "Image")
                    .Compile();
IEnumerable<Examine.SearchResult> SearchResults1 = ExamineManager.Instance
                    .SearchProviderCollection["ExternalSearcher"]
                    .Search(crawl);

I used two methods to highlight the text as below but these methods where not quite efficient!!! I had some links without any text highlighted at all.

        public string GetHighlight(string value, string highlightField, BaseLuceneSearcher searcher, string luceneRawQuery)
    {
        var query = GetQueryParser(highlightField).Parse(luceneRawQuery);
        var scorer = new QueryScorer(searcher.GetSearcher().Rewrite(query));

        var highlighter = new Highlighter(HighlightFormatter, scorer);

        var tokenStream = HighlightAnalyzer.TokenStream(highlightField, new StringReader(value));
        return highlighter.GetBestFragments(tokenStream, value, MaxNumHighlights, Separator);
    }
    protected QueryParser GetQueryParser(string highlightField)
    {
        if (!QueryParsers.ToString().Contains(highlightField))
        {
            var temp = new QueryParser(_luceneVersion, highlightField, HighlightAnalyzer);
            return temp;
        }
        return null;
    }

If you have any sample with highlighting in Examine which is quite efficient I would appreciate it..

Rey
  • 3,663
  • 3
  • 32
  • 55
  • I haven't tried Examine with highlighting, so I'm drawing a bit of a blank I'm afraid. But getting links without highlighted text - couldn't that be a CSS issue somewhere? – Jannik Anker Sep 13 '16 at 13:28
  • The problem I have is only about the highlighting paragraph below link like google does. The links are ok. Thanks anyway for your help :) – Rey Sep 13 '16 at 13:55