6

I am considering RavenDb to implement an 'advanced faceted search' scenario.
I have to deal with a complex hierarchical taxonomy and shared facets across the different branches of the tree while supporting full text search and all other basic features.

Is there any resource out there that documents how to do this using the RavenDb API?

Insanely complex paper on the subject: Beyond Basic Faceted Search
Solr's way: HierarchicalFaceting

maxbeaudoin
  • 6,546
  • 5
  • 38
  • 53

3 Answers3

5

Finally..

using System.Collections.Generic;
using System.Linq;
using NUnit.Framework;
using Raven.Abstractions.Data;
using Raven.Client;
using Raven.Client.Document;
using Raven.Client.Indexes;
using Raven.Client.Linq;

namespace Prototype.Search.Tests
{
    [TestFixture]
    public class HierarchicalFaceting
    {
        //
        // Document definition
        //
        /// <summary>
        /// Sample document type used by this fixture. Each entry in
        /// <see cref="Categories"/> is one category term of the document.
        /// </summary>
        public class Doc
        {
            /// <summary>Identifier of the document.</summary>
            public int Id { get; set; }

            /// <summary>
            /// Category terms attached to the document. The sample data in this fixture
            /// writes them as "depth/path" strings such as "1/NonFic/Law".
            /// </summary>
            public List<string> Categories { get; set; }

            /// <summary>Creates a document whose category list is empty rather than null.</summary>
            public Doc()
            {
                this.Categories = new List<string>();
            }
        }

        //
        // Data sample
        //
        /// <summary>
        /// Lazily yields the three sample documents (ids 1 to 3). New <see cref="Doc"/>
        /// instances are created every time the sequence is enumerated.
        /// </summary>
        /// <returns>The sample documents, in id order.</returns>
        public IEnumerable<Doc> GetDocs()
        {
            yield return NewDoc(1, "0/NonFic", "1/NonFic/Law");
            yield return NewDoc(2, "0/NonFic", "1/NonFic/Sci");
            yield return NewDoc(3, "0/NonFic", "1/NonFic/Hist", "1/NonFic/Sci", "2/NonFic/Sci/Phys");
        }

        /// <summary>Builds one sample document from an id and its category terms.</summary>
        private static Doc NewDoc(int id, params string[] categories)
        {
            return new Doc { Id = id, Categories = new List<string>(categories) };
        }

        //
        // The index
        //
        /// <summary>
        /// Map-only index over <see cref="Doc"/> that emits one entry per
        /// (document, category) pair, with the category term in the "Category" field.
        /// </summary>
        public class DocByCategory : AbstractIndexCreationTask<Doc, DocByCategory.ReduceResult>
        {
            /// <summary>Shape of an index entry; used as the query type in the tests.</summary>
            public class ReduceResult
            {
                /// <summary>A single category term taken from <c>Doc.Categories</c>.</summary>
                public string Category { get; set; }
            }

            /// <summary>Defines the map function of the index.</summary>
            public DocByCategory()
            {
                // Method-syntax form of
                // "from d in docs from c in d.Categories select new { Category = c }".
                Map = docs => docs.SelectMany(
                    d => d.Categories,
                    (d, c) => new { Category = c });
            }
        }

        //
        // FacetSetup
        //
        /// <summary>
        /// Builds the facet setup with id "facets/Doc", which holds a single facet
        /// named "Category".
        /// </summary>
        /// <returns>A new <c>FacetSetup</c> instance on every call.</returns>
        public FacetSetup GetDocFacetSetup()
        {
            var categoryFacet = new Facet { Name = "Category" };

            var setup = new FacetSetup { Id = "facets/Doc" };
            setup.Facets = new List<Facet> { categoryFacet };
            return setup;
        }

        /// <summary>
        /// NUnit set-up step: connects to the RavenDB server at http://localhost:8080,
        /// creates the indexes declared in this assembly and stores the facet setup
        /// returned by <see cref="GetDocFacetSetup"/>.
        /// </summary>
        [SetUp]
        public void SetupDb()
        {
            // "using" releases the store and the session even when a call below throws;
            // previously the session was never disposed at all.
            using (IDocumentStore store = new DocumentStore { Url = "http://localhost:8080" })
            {
                store.Initialize();
                IndexCreation.CreateIndexes(typeof(HierarchicalFaceting).Assembly, store);

                using (var session = store.OpenSession())
                {
                    session.Store(GetDocFacetSetup());
                    session.SaveChanges();
                }
            }
        }

        /// <summary>
        /// Manual clean-up helper (ignored by default): deletes the
        /// <see cref="DocByCategory"/> index and then every document matched by an
        /// empty query against "Raven/DocumentsByEntityName".
        /// </summary>
        [Test]
        [Ignore]
        public void DeleteAll()
        {
            // "using" guarantees the store is released even if a delete call throws.
            using (IDocumentStore store = new DocumentStore { Url = "http://localhost:8080" })
            {
                store.Initialize();

                // Ask the index task for its own name instead of hard-coding it: the index is
                // registered under IndexName ("DocByCategory"), so the previous literal
                // "Raven/DocByCategory" did not refer to the index created by this fixture.
                store.DatabaseCommands.DeleteIndex(new DocByCategory().IndexName);
                store.DatabaseCommands.DeleteByIndex("Raven/DocumentsByEntityName", new IndexQuery());
            }
        }

        /// <summary>
        /// Manual seeding helper (ignored by default): stores every document produced by
        /// <see cref="GetDocs"/> and saves them in a single <c>SaveChanges</c> call.
        /// </summary>
        [Test]
        [Ignore]
        public void StoreDocs()
        {
            // "using" disposes the session and the store on every exit path, including
            // an exception thrown by Store or SaveChanges.
            using (IDocumentStore store = new DocumentStore { Url = "http://localhost:8080" })
            {
                store.Initialize();

                using (var session = store.OpenSession())
                {
                    foreach (var doc in GetDocs())
                    {
                        session.Store(doc);
                    }

                    session.SaveChanges();
                }
            }
        }

        /// <summary>
        /// Queries the <see cref="DocByCategory"/> index for entries whose category equals
        /// "1/NonFic/Sci", then requests the facets of that query using the "facets/Doc"
        /// setup. The test makes no assertions; it only executes both requests.
        /// </summary>
        [Test]
        public void QueryDocsByCategory()
        {
            // "using" disposes the session and the store even when a query throws.
            using (IDocumentStore store = new DocumentStore { Url = "http://localhost:8080" })
            {
                store.Initialize();

                using (var session = store.OpenSession())
                {
                    var q = session.Query<DocByCategory.ReduceResult, DocByCategory>()
                        .Where(d => d.Category == "1/NonFic/Sci")
                        .As<Doc>();

                    // Both calls are kept because they are what actually runs the query
                    // and the facet request.
                    var results = q.ToList();
                    var facetResults = q.ToFacets("facets/Doc").ToList();
                }
            }
        }

        /// <summary>
        /// Queries the <see cref="DocByCategory"/> index for entries whose category starts
        /// with "1/NonFic", then requests the facets of that query using the "facets/Doc"
        /// setup. The test makes no assertions; it only executes both requests.
        /// </summary>
        [Test]
        public void GetFacets()
        {
            // "using" disposes the session and the store even when a query throws.
            using (IDocumentStore store = new DocumentStore { Url = "http://localhost:8080" })
            {
                store.Initialize();

                using (var session = store.OpenSession())
                {
                    var q = session.Query<DocByCategory.ReduceResult, DocByCategory>()
                        .Where(d => d.Category.StartsWith("1/NonFic"))
                        .As<Doc>();

                    // Both calls are kept because they are what actually runs the query
                    // and the facet request.
                    var results = q.ToList();
                    var facetResults = q.ToFacets("facets/Doc").ToList();
                }
            }
        }
    }
}
maxbeaudoin
  • 6,546
  • 5
  • 38
  • 53
  • thanks for the code, works like a breeze! What is the best approach when Doc has a title field as well, and I want to filter the facets based on the fact that this Title contains a certain string? – Serge van den Oever Oct 16 '12 at 13:32
1

I would handle the tree-search part of this using pure Lucene for speed's sake. 2 approaches are the parent-child linkages method and the path-enumeration/'Dewey Decimal' method.

Parent-child is how we all learned to implement linked lists back in algorithm class. It is easy to update, but queries require visiting each node (you can't get directly from a parent to its great-grandchild, for example). Given that you need to visit all ancestors of a node anyway to get all of the attributes (since the idea is to share the attributes), having to visit all ancestors may be a moot point.

How to store tree data in a Lucene/Solr/Elasticsearch index or a NoSQL db? covers the path-enumeration/'Dewey Decimal' method.

Either approach can handle an arbitrarily complex hierarchy, as long as it is a true hierarchy (i.e. a directed acyclic graph (D.A.G.)).

Community
  • 1
  • 1
Mark Leighton Fisher
  • 5,609
  • 2
  • 18
  • 29
  • 1
    First, thank you for your time. However, I need to clarify one thing: my data - stored in Esent - is not hierarchical; the different facets of my data are hierarchical. In Lucene's vocabulary, I believe they are referred to as _terms_, so the terms themselves are hierarchical. Refer to page 36 of the paper I linked above and look at the graph in the top right corner. Using your approach, I would design a Lucene index to query hierarchical data, but I'm looking for clean and simple resources on applying this concept to Lucene's terms: a hierarchical taxonomy. – maxbeaudoin Apr 20 '12 at 20:44
1

I fixed it already.

I create the index as follows:

/// <summary>
/// Map-only index over <c>Product</c> that emits one entry per (product, category)
/// pair, carrying the category term together with the product title.
/// </summary>
public class ProductByCategory : AbstractIndexCreationTask<Product, ProductByCategory.ReduceResult>
{
    /// <summary>Shape of an index entry; used as the query type.</summary>
    public class ReduceResult
    {
        /// <summary>A single category term taken from the product's categories.</summary>
        public string Category { get; set; }

        /// <summary>Title of the product the entry was emitted for.</summary>
        public string Title { get; set; }
    }

    /// <summary>Declares the field options for Title and the map function.</summary>
    public ProductByCategory()
    {
        // Title is indexed as Analyzed and also stored in the index.
        Indexes.Add(x => x.Title, FieldIndexing.Analyzed);
        Stores.Add(x => x.Title, FieldStorage.Yes);

        // Method-syntax form of
        // "from p in products from c in p.Categories select new { Category = c, Title = p.Title }".
        Map = products => products.SelectMany(
            p => p.Categories,
            (p, c) => new { Category = c, Title = p.Title });
    }
}

And I query it like:

// Search on Title for "Sony", restricted to categories starting with "1/beeld en geluid".
var matches = session.Query<ProductByCategory.ReduceResult, ProductByCategory>()
    .Search(x => x.Title, "Sony")
    .Where(r => r.Category.StartsWith("1/beeld en geluid"));
var q = matches.As<Product>();

// Facets for the same query; "facets/ProductCategory" is presumably the id of a stored
// facet setup document - confirm it exists before running.
var facetResults = q.ToFacets("facets/ProductCategory");
Serge van den Oever
  • 4,340
  • 8
  • 45
  • 66