
I am running the following query in ArangoDB:

LET catalogDatasets = []
LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open"  RETURN d._id )
LET myDatasets = []
LET myPurchasedDatasets = []
LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )

LET unorderedDatasetsIds = (
    FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
     FILTER dataset._id IN searchTarget  RETURN dataset._id
)

LET ordered = (
    FOR wl IN wordLinks
    FILTER wl._from IN unorderedDatasetsIds
    FOR x IN words
        FILTER x._id == wl._to
        COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
        SORT score
        LIMIT 0, 20
       RETURN did
)
RETURN {
    dids: ordered,
    number_of_items: LENGTH(unorderedDatasetsIds)
}

My search words all use prefixes, like:

pref:banana,|pref:chocollate

Basically I want to optimize this query because it takes around 2 seconds to return. One of my ideas was to limit the fulltext search to 1000 items, but by doing so the returned datasets would be effectively random, since they would depend on the order in which ArangoDB returns the matches.
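
For reference, that cap would use the optional fourth (limit) argument of FULLTEXT(), roughly like this (the 1000 is just the cap I mentioned):

LET unorderedDatasetsIds = (
    // cap the fulltext matches at 1000; the cut-off happens before any sorting,
    // which is why the surviving documents are effectively arbitrary
    FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords, 1000)
        FILTER dataset._id IN searchTarget
        RETURN dataset._id
)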

What kind of optimizations could I apply to this query to make it faster?

P.S.: I have a union of empty arrays here, but they are not always empty; they just happen to be empty in the case of this query.

EDIT: My query's explain output:

Query string:
 LET catalogDatasets = []
 LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open"  RETURN d._id )
 LET myDatasets = []
 LET myPurchasedDatasets = []
 LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )
 LET unorderedDatasetsIds = (
     FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
      FILTER dataset._id IN searchTarget  RETURN dataset._id
 )
 LET ordered = (
     FOR wl IN wordLinks
     FILTER wl._from IN unorderedDatasetsIds
     FOR x IN words
         FILTER x._id == wl._to
         COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
         SORT score
         LIMIT 0, 20
        RETURN did
 )
 RETURN {
     dids: ordered,
     number_of_items: LENGTH(unorderedDatasetsIds)
 }

Execution plan:
 Id   NodeType                  Est.   Comment
  1   SingletonNode                1   * ROOT
  9   SubqueryNode                 1     - LET openDatasets = ...   /* const subquery */
  3   SingletonNode                1       * ROOT
  4   EnumerateCollectionNode   9752         - FOR d IN datasets   /* full collection scan */
  5   CalculationNode           9752           - LET #19 = (d.`visibility` == "open")   /* simple expression */   /* collections used: d : datasets */
  6   FilterNode                9752           - FILTER #19
  7   CalculationNode           9752           - LET #21 = d.`_id`   /* attribute expression */   /* collections used: d : datasets */
  8   ReturnNode                9752           - RETURN #21
 41   CalculationNode              1     - LET #39 = SORTED_UNIQUE(UNIQUE(UNION([ ], openDatasets, [ ], [ ])))   /* simple expression */
 20   SubqueryNode                 1     - LET unorderedDatasetsIds = ...   /* subquery */
 13   SingletonNode                1       * ROOT
 38   IndexNode                 9752         - FOR dataset IN datasets   /* fulltext index scan */
 16   CalculationNode           9752           - LET #25 = (dataset.`_id` in /* sorted */ #39)   /* simple expression */   /* collections used: dataset : datasets */
 17   FilterNode                9752           - FILTER #25
 18   CalculationNode           9752           - LET #27 = dataset.`_id`   /* attribute expression */   /* collections used: dataset : datasets */
 19   ReturnNode                9752           - RETURN #27
 34   SubqueryNode                 1     - LET ordered = ...   /* subquery */
 21   SingletonNode                1       * ROOT
 40   IndexNode                  410         - FOR wl IN wordLinks   /* edge index scan */
 28   CalculationNode            410           - LET #33 = wl.`_from`   /* attribute expression */   /* collections used: wl : wordLinks */
 39   IndexNode                  410           - FOR x IN words   /* primary index scan */
 37   SortNode                   410             - SORT #33 ASC
 29   CalculationNode            410             - LET #35 = (wl.`invFq` / (x.`numEdges` + 0.1))   /* simple expression */   /* collections used: wl : wordLinks, x : words */
 30   CollectNode                328             - COLLECT did = #33 INTO score = #35   /* sorted */
 31   SortNode                   328             - SORT score ASC
 32   LimitNode                   20             - LIMIT 0, 20
 33   ReturnNode                  20             - RETURN did
 35   CalculationNode              1     - LET #37 = { "dids" : ordered, "number_of_items" : LENGTH(unorderedDatasetsIds) }   /* simple expression */
 36   ReturnNode                   1     - RETURN #37

Indexes used:
 By   Type       Collection   Unique   Sparse   Selectivity   Fields               Ranges
 38   fulltext   datasets     false    true             n/a   [ `word_list` ]      FULLTEXT(datasets   /* all collection documents */, "word_list", "'prefix:トウ,|prefix:とう'")
 40   edge       wordLinks    false    false         3.05 %   [ `_from`, `_to` ]   (wl.`_from` in unorderedDatasetsIds)
 39   primary    words        true     false       100.00 %   [ `_key` ]           (x.`_id` == wl.`_to`)

Optimization rules applied:
 Id   RuleName
  1   move-calculations-up
  2   move-filters-up
  3   remove-redundant-calculations
  4   remove-unnecessary-calculations
  5   move-calculations-up-2
  6   move-filters-up-2
  7   fulltext-index-optimizer
  8   use-indexes
  9   remove-filter-covered-by-index
 10   sort-in-values
 11   remove-unnecessary-calculations-2
 12   move-calculations-down
Israel Zinc
  • Would you share the output of `Explain` from the Query editor in the UI, please? – Kaveh Vahedipour Apr 23 '18 at 09:35
  • Done. I edited my question. – Israel Zinc Apr 23 '18 at 10:00
  • How does performance change if you add a sparse non-unique index on datasets.visibility? When you want to return paged results, the best performance comes from running the full query: first apply the filtering, then the sorting, then use LIMIT with a start/count pair based on the page parameters passed to the query (note that LIMIT can be passed two parameters; see the sketch after these comments). Also, are your `searchwords` in the format `pref:banana,|pref:chocollate` or `prefix:banana,|prefix:chocollate`? – David Thomas Apr 23 '18 at 11:26
  • Did the below give you any advantage? – Kaveh Vahedipour Apr 24 '18 at 08:25
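
For context, the filter → sort → LIMIT pattern from David Thomas's comment would look roughly like this in AQL (the @offset and @count bind parameters and the sort key are placeholders for illustration, not part of the original query):

FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
    FILTER dataset.visibility == "open"   // filter first
    SORT dataset._key                     // then sort on something deterministic
    LIMIT @offset, @count                 // then page: LIMIT takes an offset and a count
    RETURN dataset._id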

1 Answer


OK. Kind of a tough proposition. The query is quite expensive. But I tried something:

LET catalogDatasets = []
LET myDatasets = []
LET myPurchasedDatasets = []
LET searchTarget = UNIQUE( UNION( catalogDatasets, myDatasets, myPurchasedDatasets ) )
LET unorderedDatasetsIds = (
    FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
        /* the visibility check is folded into the fulltext loop,
           so the separate openDatasets subquery is no longer needed */
        FILTER dataset._id IN searchTarget || dataset.visibility == "open"
        RETURN dataset._id
)
LET ordered = (
    FOR wl IN wordLinks
        FILTER wl._from IN unorderedDatasetsIds
        FOR x IN words
            FILTER x._id == wl._to
            COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
            SORT score
            LIMIT 0, 20
            RETURN did
)
RETURN {
    dids: ordered,
    number_of_items: LENGTH(unorderedDatasetsIds)
}

I don't see anything else obvious here. But clearly, making the separate query for openDatasets go away should be significant if there are not too many "open" datasets.
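
For reference, the execution plan in the question shows the openDatasets subquery running as a full collection scan over roughly 9752 documents, which is a good part of the cost. A quick way to check the "not too many open" assumption is something like:

// count how many datasets are actually "open"
RETURN LENGTH(
    FOR d IN datasets
        FILTER d.visibility == "open"
        RETURN 1
)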

Kaveh Vahedipour
  • In the end we changed the way we did the query, but this one did indeed speed it up from 2.5 s to 1.5 s. Thank you – Israel Zinc Apr 27 '18 at 02:08