When performing a text search MongoDB assigns a score to each document that contains the search term in the indexed fields. The score represents the relevance of a document to a given text search query.
We can use the $function operator to define a custom function that implements this behavior: tokenizing the subject and removing English stop words.
// Seed the sample collection.
// insertMany() is the modern replacement for the legacy insert() helper
// (deprecated in current drivers/shells) when inserting an array of documents;
// the documents written are identical.
db.restaurants.insertMany(
[
{ _id: 1, subject: "coffee", author: "xyz", views: 50 },
{ _id: 2, subject: "Coffee Shopping", author: "efg", views: 5 },
{ _id: 3, subject: "Baking a cake", author: "abc", views: 90 },
{ _id: 4, subject: "baking", author: "xyz", views: 100 },
{ _id: 5, subject: "Café Con Leche", author: "abc", views: 200 },
{ _id: 6, subject: "Сырники", author: "jkl", views: 80 },
{ _id: 7, subject: "coffee and cream", author: "efg", views: 10 },
{ _id: 8, subject: "Cafe con Leche cafe", author: "xyz", views: 10 },
{ _id: 9, subject: "Just milk", author: "efg", views: 23 },
{ _id: 10, subject: "Milk with coffee", author: "xyz", views: 16 },
{ _id: 11, subject: "One dollar coffe", author: "efg", views: 23 },
{ _id: 12, subject: "$cafe and leche", author: "xyz", views: 16 },
{ _id: 13, subject: "cafe, CAFE, CaFe...", author: "xyz", views: 16 }
]
)
Query:
let search_keyword = "Keep calm and take a cup of Cafe (coffee)";

// Tokenize a string using whitespace and most punctuation as delimiters.
// https://github.com/mongodb/mongo/blob/v3.2/src/mongo/db/fts/tokenizer.cpp
function tokenize(str) {
  return str.split(/[^\w]+/).map((word) => word.toLowerCase()).filter((word) => word);
}

// BUG FIX: the $function body is serialized and executed server-side, so it
// cannot see shell/application-scope variables such as `search_keyword` or
// `STOP_WORDS_ENGLISH`. Compute the stop-word-filtered keyword tokens here,
// on the client, and hand them to the function through `args`.
// (Tokens contain only \w characters, so none can start with "$" and be
// misread as a field path by the aggregation expression parser.)
const search_tokens = tokenize(search_keyword)
  .filter((word) => !STOP_WORDS_ENGLISH.includes(word));

db.restaurants.aggregate([
  {
    "$match": {
      "$text": {
        "$search": search_keyword
      }
    }
  },
  {
    "$addFields": {
      "matches": {
        "$function": {
          // body runs in an isolated server-side JS context, so the
          // tokenizer is re-declared inside it.
          body: function (subject, keywords) {
            function tokenizer(str) {
              return str.split(/[^\w]+/).map(keyword => keyword.toLowerCase()).filter(char => char);
            }
            // Return Array<String>: subject tokens that match a search keyword.
            return tokenizer(subject).filter((word) => keywords.includes(word));
          },
          args: ["$subject", search_tokens],
          "lang": "js"
        }
      }
    }
  }
]);
Output:
{ "_id" : 12, "subject" : "$cafe and leche", "author" : "xyz", "views" : 16, "matches" : [ "cafe" ] }
{ "_id" : 13, "subject" : "cafe, CAFE, CaFe...", "author" : "xyz", "views" : 16, "matches" : [ "cafe", "cafe", "cafe" ] }
{ "_id" : 10, "subject" : "Milk with coffee", "author" : "xyz", "views" : 16, "matches" : [ "coffee" ] }
{ "_id" : 1, "subject" : "coffee", "author" : "xyz", "views" : 50, "matches" : [ "coffee" ] }
{ "_id" : 7, "subject" : "coffee and cream", "author" : "efg", "views" : 10, "matches" : [ "coffee" ] }
{ "_id" : 8, "subject" : "Cafe con Leche cafe", "author" : "xyz", "views" : 10, "matches" : [ "cafe", "cafe" ] }
{ "_id" : 5, "subject" : "Café Con Leche", "author" : "abc", "views" : 200, "matches" : [ ] }
{ "_id" : 2, "subject" : "Coffee Shopping", "author" : "efg", "views" : 5, "matches" : [ "coffee" ] }
List of English stop words:
// English stop-word list used by MongoDB text search, copied verbatim from:
// https://github.com/mongodb/mongo/blob/v3.2/src/mongo/db/fts/stop_words_english.txt
const STOP_WORDS_ENGLISH = [
  "a", "about", "above", "after", "again", "against", "all", "am", "an",
  "and", "any", "are", "aren't", "as", "at", "be", "because", "been",
  "before", "being", "below", "between", "both", "but", "by", "can't",
  "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't",
  "doing", "don't", "down", "during", "each", "few", "for", "from",
  "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
  "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
  "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
  "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
  "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor",
  "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our",
  "ours", "ourselves", "out", "over", "own", "same", "shan't", "she",
  "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such",
  "than", "that", "that's", "the", "their", "theirs", "them", "themselves",
  "then", "there", "there's", "these", "they", "they'd", "they'll",
  "they're", "they've", "this", "those", "through", "to", "too", "under",
  "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're",
  "we've", "were", "weren't", "what", "what's", "when", "when's", "where",
  "where's", "which", "while", "who", "who's", "whom", "why", "why's",
  "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're",
  "you've", "your", "yours", "yourself", "yourselves"
];
Finally, you could count the number of matches (using $size) to get the desired result.
NOTE:
- This could be improved by replacing diacritics with an equivalent ASCII character (Café !== Cafe).
- As @AlexBlex pointed out in the comments, keywords_tokens and stop_words_removed should be calculated at the application level.