I am running the following query in arangodb:
LET catalogDatasets = []
LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open" RETURN d._id )
LET myDatasets = []
LET myPurchasedDatasets = []
LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )
LET unorderedDatasetsIds = (
FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
FILTER dataset._id IN searchTarget RETURN dataset._id
)
LET ordered = (
FOR wl IN wordLinks
FILTER wl._from IN unorderedDatasetsIds
FOR x IN words
FILTER x._id == wl._to
COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
SORT score
LIMIT 0, 20
RETURN did
)
RETURN {
dids: ordered,
number_of_items: LENGTH(unorderedDatasetsIds)
}
My searchwords all uses prefixes like:
pref:banana,|pref:chocollate
Basically I want to optimize this query because it's taking around 2 seconds to return. One of my ideas was to limit the number of items to 1000 in the fulltext search, but by doing so, the datasets will be random since it will depend of the order arangodb return the queries.
What kind of optimizations could I apply to this query to make it faster?
P.S.: I have an union of empty datasets, but sometimes they're not empty. Just happen in the case of this query.
EDIT My Query Explain:
Query string:
LET catalogDatasets = []
LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open" RETURN d._id )
LET myDatasets = []
LET myPurchasedDatasets = []
LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )
LET unorderedDatasetsIds = (
FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
FILTER dataset._id IN searchTarget RETURN dataset._id
)
LET ordered = (
FOR wl IN wordLinks
FILTER wl._from IN unorderedDatasetsIds
FOR x IN words
FILTER x._id == wl._to
COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
SORT score
LIMIT 0, 20
RETURN did
)
RETURN {
dids: ordered,
number_of_items: LENGTH(unorderedDatasetsIds)
}
Execution plan:
Id NodeType Est. Comment
1 SingletonNode 1 * ROOT
9 SubqueryNode 1 - LET openDatasets = ... /* const subquery */
3 SingletonNode 1 * ROOT
4 EnumerateCollectionNode 9752 - FOR d IN datasets /* full collection scan */
5 CalculationNode 9752 - LET #19 = (d.`visibility` == "open") /* simple expression */ /* collections used: d : datasets */
6 FilterNode 9752 - FILTER #19
7 CalculationNode 9752 - LET #21 = d.`_id` /* attribute expression */ /* collections used: d : datasets */
8 ReturnNode 9752 - RETURN #21
41 CalculationNode 1 - LET #39 = SORTED_UNIQUE(UNIQUE(UNION([ ], openDatasets, [ ], [ ]))) /* simple expression */
20 SubqueryNode 1 - LET unorderedDatasetsIds = ... /* subquery */
13 SingletonNode 1 * ROOT
38 IndexNode 9752 - FOR dataset IN datasets /* fulltext index scan */
16 CalculationNode 9752 - LET #25 = (dataset.`_id` in /* sorted */ #39) /* simple expression */ /* collections used: dataset : datasets */
17 FilterNode 9752 - FILTER #25
18 CalculationNode 9752 - LET #27 = dataset.`_id` /* attribute expression */ /* collections used: dataset : datasets */
19 ReturnNode 9752 - RETURN #27
34 SubqueryNode 1 - LET ordered = ... /* subquery */
21 SingletonNode 1 * ROOT
40 IndexNode 410 - FOR wl IN wordLinks /* edge index scan */
28 CalculationNode 410 - LET #33 = wl.`_from` /* attribute expression */ /* collections used: wl : wordLinks */
39 IndexNode 410 - FOR x IN words /* primary index scan */
37 SortNode 410 - SORT #33 ASC
29 CalculationNode 410 - LET #35 = (wl.`invFq` / (x.`numEdges` + 0.1)) /* simple expression */ /* collections used: wl : wordLinks, x : words */
30 CollectNode 328 - COLLECT did = #33 INTO score = #35 /* sorted */
31 SortNode 328 - SORT score ASC
32 LimitNode 20 - LIMIT 0, 20
33 ReturnNode 20 - RETURN did
35 CalculationNode 1 - LET #37 = { "dids" : ordered, "number_of_items" : LENGTH(unorderedDatasetsIds) } /* simple expression */
36 ReturnNode 1 - RETURN #37
Indexes used:
By Type Collection Unique Sparse Selectivity Fields Ranges
38 fulltext datasets false true n/a [ `word_list` ] FULLTEXT(datasets /* all collection documents */, "word_list", "'prefix:トウ,|prefix:とう'")
40 edge wordLinks false false 3.05 % [ `_from`, `_to` ] (wl.`_from` in unorderedDatasetsIds)
39 primary words true false 100.00 % [ `_key` ] (x.`_id` == wl.`_to`)
Optimization rules applied:
Id RuleName
1 move-calculations-up
2 move-filters-up
3 remove-redundant-calculations
4 remove-unnecessary-calculations
5 move-calculations-up-2
6 move-filters-up-2
7 fulltext-index-optimizer
8 use-indexes
9 remove-filter-covered-by-index
10 sort-in-values
11 remove-unnecessary-calculations-2
12 move-calculations-down