Here is how I am creating the Azure search index for cosmos db documents with SearchRequest Model (has excluded some fields from SearchRequest Model for brevity).
Please suggest the changes needed in the below implementation to prevent the edgeNgramTokenFilterV2 token filter not to break the words at a hyphen.
public class SearchRequest
{
[SimpleField(IsKey = true, IsFilterable = true)]
public string id { get; set; }
[SearchableField(SearchAnalyzerName = LexicalAnalyzerName.Values.StandardLucene, IndexAnalyzerName = "prefixEdgeAnalyzer")]
public string EntityID { get; set; }
public MetaData? MetaData { get; set; }
}
public class MetaData
{
[SearchableField(AnalyzerName = LexicalAnalyzerName.Values.EnMicrosoft)]
public string? CustomerName { get; set; }
[SearchableField(SearchAnalyzerName = LexicalAnalyzerName.Values.StandardLucene, IndexAnalyzerName = "prefixEdgeAnalyzer")]
public List<string>? OpportunityIDs { get; set; }
}
public async Task<Response<SearchIndex>> CreateIndex(string indexName)
{
try
{
var nedgeTokenfilter = new EdgeNGramTokenFilter("edgeNgramTokenFilterV2");
nedgeTokenfilter.MinGram = 3;
nedgeTokenfilter.MaxGram = 20;
nedgeTokenfilter.Side = EdgeNGramTokenFilterSide.Front;
var prefixEdgeAnalyzer = new CustomAnalyzer("prefixEdgeAnalyzer", LexicalTokenizerName.Standard);
prefixEdgeAnalyzer.TokenFilters.Add(TokenFilterName.Lowercase);
prefixEdgeAnalyzer.TokenFilters.Add("edgeNgramTokenFilterV2");
var suggester = new SearchSuggester("spellCheckSuggester", $"MetaData/{nameof(SearchRequest.MetaData.CustomerName)}"); //for spell check
FieldBuilder fieldBuilder = new FieldBuilder();
var searchFields = fieldBuilder.Build(typeof(SearchRequest));
var definition = new SearchIndex(indexName, searchFields);
definition.TokenFilters.Add(nedgeTokenfilter);
definition.Analyzers.Add(prefixEdgeAnalyzer);
definition.Suggesters.Add(suggester);
var response = await _adminClient.CreateOrUpdateIndexAsync(definition).ConfigureAwait(false);
return response;
}
catch (Exception ex)
{
_logger.LogError(ex, ex.Message);
throw;
}
}
On using Analyze API, I can see that text - "7-ETREW" if tokenised as etr, etre, etrew. While I need to get tokenized as 7-e, 7-et, 7-etr, 7-etre, 7-etrew.
https://{myServicename}.search.windows.net/indexes/{MyIndexname}/analyze?api-version=2020-06-30
{
"text": "7-etrew",
"analyzer": "prefixEdgeAnalyzer"
}