I am using Django and Postgresql Full-Text Search to generate a paginated search result for my users. I rank and highlight my results.
qs = (
self.filter(Q(title_vector=SearchQuery(search_term)))
.annotate(rank=rank)
.order_by("-rank", "pk")
.annotate(
title_snippet=SearchHeadline(
"title",
SearchQuery(search_term),
highlight_all=True
)
)
.distinct()
)
In performance testing, I've identified that the SearchHeadline function is extremely heavy (as mentioned in the documentation) and is applied to every single row in the result - even if there are thousands of results being paginated in sets of 25. This makes the query take too long to complete. Another user on StackOverflow had the same issue as mine and discovered that only running ts_headline
against the paginated results significantly improves performance (Relevant Question). I tested and confirmed that a similar approach would solve my problem.
My question is how can I tweak my Django code to generate a SQL query against a query like the below example? After the Paginator applies a limit and offset to the result set, only then run ts_headline
against those results only.
SELECT
id,
ts_headline('italian', body, to_tsquery('italian', 'torino')) as headline,
rank,
title,
location
FROM (
SELECT
id,
body,
title,
location,
ts_rank(body_tsvector, query) as rank
FROM
fulltextsearch.documents, to_tsquery('italian', 'torino') as query
WHERE
to_tsvector('italian', coalesce(body,'')) @@ query
LIMIT 10
OFFSET 0
) as s
EDIT: Here is the current (unoptimized) query:
SELECT DISTINCT
"dockets_document"."title",
"dockets_document"."content",
"dockets_document"."title_vector",
"dockets_document"."search_vector",
(1 * ts_rank("dockets_document"."search_vector", plainto_tsquery('public.bb'::regconfig, 'hearing'))) AS "rank",
ts_headline('public.bb'::regconfig, "dockets_document"."title", to_tsquery('hearing'), 'StartSel=''<mark>'', StopSel=''</mark>'', HighlightAll=true') AS "title_snippet",
ts_headline('public.bb'::regconfig, array_to_string("dockets_document"."content", ' ', ''), to_tsquery('hearing'), 'StartSel=''<mark>'', StopSel=''</mark>'', MaxFragments=5') AS "content_snippet"
FROM "dockets_document"
WHERE
"dockets_document"."search_vector" @@ plainto_tsquery('public.bb'::regconfig, 'hearing')
ORDER BY "rank" DESC, "dockets_document"."id" ASC
LIMIT 25;
Here is the hand-edited desired query (10x speedup):
SELECT
ts_headline('public.bb'::regconfig, title, to_tsquery('hearing'), 'StartSel=''<mark>'', StopSel=''</mark>'', HighlightAll=true') AS "title_snippet",
ts_headline('public.bb'::regconfig, array_to_string(content, ' ', ''), to_tsquery('hearing'), 'StartSel=''<mark>'', StopSel=''</mark>'', MaxFragments=5') AS "content_snippet"
FROM (
SELECT DISTINCT
"dockets_document"."title" as title,
"dockets_document"."content" as content,
(1 * ts_rank("dockets_document"."search_vector", plainto_tsquery('public.bb'::regconfig, 'hearing'))) AS "rank"
FROM "dockets_document"
WHERE
"dockets_document"."search_vector" @@ plainto_tsquery('public.bb'::regconfig, 'hearing')
ORDER BY "rank" DESC, "dockets_document"."id" ASC
LIMIT 25 ) as s;
EDIT 2 The DISTINCT is the problem.
Here is my full query to see how I could eliminate the DISTINCT:
SELECT "dockets_document"."id",
"dockets_document"."title",
"dockets_document"."url",
"dockets_document"."content",
"dockets_document"."num_tables",
"dockets_document"."num_pages",
"dockets_document"."tables_processed",
"dockets_document"."summary",
"dockets_document"."importance",
"dockets_document"."title_vector",
"dockets_document"."search_vector",
"dockets_document"."datetime_created",
"dockets_document"."datetime_modified",
"dockets_document"."docket_id",
"dockets_document"."docket_number",
"dockets_document"."date_filed",
"dockets_document"."pdf",
((((1 * ts_rank("dockets_document"."search_vector", plainto_tsquery('public.bb'::regconfig, 'knighthead'))) + (0.8 * ts_rank("dockets_attachment"."search_vector", plainto_tsquery('public.bb'::regconfig, 'knighthead')))) + (1.0 * ts_rank("dockets_table"."search_vector", plainto_tsquery('public.bb'::regconfig, 'knighthead')))) + (0.8 * ts_rank(T5."search_vector", plainto_tsquery('public.bb'::regconfig, 'knighthead')))) AS "rank",
ts_headline('public.bb'::regconfig, "dockets_document"."title", to_tsquery('public.bb'::regconfig, 'knighthead'), 'StartSel=''<mark>'', StopSel=''</mark>'', HighlightAll=true') AS "title_snippet",
ts_headline('public.bb'::regconfig, CONCAT(array_to_string("dockets_document"."content", ' ', ''), CONCAT(array_to_string("dockets_attachment"."content", ' ', ''), CONCAT(array_to_string("dockets_table"."content", ' ', ''), array_to_string(T5."content", ' ', '')))), to_tsquery('public.bb'::regconfig, 'knighthead'), 'StartSel=''<mark>'', StopSel=''</mark>'', MaxFragments=5') AS "content_snippet"
FROM "dockets_document"
LEFT OUTER JOIN "dockets_attachment"
ON ("dockets_document"."id" = "dockets_attachment"."main_document_id")
LEFT OUTER JOIN "dockets_table"
ON ("dockets_document"."id" = "dockets_table"."object_id" AND ("dockets_table"."content_type_id" = 8))
LEFT OUTER JOIN "dockets_table" T5
ON ("dockets_attachment"."id" = T5."object_id" AND (T5."content_type_id" = 24))
WHERE ("dockets_document"."docket_id" = 'p_LATAM' AND NOT ("dockets_document"."title_vector" @@ ((((((((phraseto_tsquery('public.bb'::regconfig, 'pro hac') || phraseto_tsquery('public.bb'::regconfig, 'certificate of mailing')) || phraseto_tsquery('public.bb'::regconfig, 'affidavit/declaration of mailing')) || phraseto_tsquery('public.bb'::regconfig, 'request for service')) || phraseto_tsquery('public.bb'::regconfig, 'certification of counsel')) || phraseto_tsquery('public.bb'::regconfig, 'certification of no objection')) || phraseto_tsquery('public.bb'::regconfig, 'receipt of filing fee')) || phraseto_tsquery('public.bb'::regconfig, 'affidavit of service')) || phraseto_tsquery('public.bb'::regconfig, 'Transfer/Assignment of Claim')) AND "dockets_document"."title_vector" IS NOT NULL) AND ("dockets_document"."search_vector" @@ plainto_tsquery('public.bb'::regconfig, 'knighthead') OR "dockets_attachment"."search_vector" @@ plainto_tsquery('public.bb'::regconfig, 'knighthead') OR "dockets_table"."search_vector" @@ plainto_tsquery('public.bb'::regconfig, 'knighthead') OR T5."search_vector" @@ plainto_tsquery('public.bb'::regconfig, 'knighthead')) AND "dockets_document"."num_pages" >= 0 AND "dockets_document"."num_tables" >= 0)
ORDER BY "rank" DESC, "dockets_document"."id" ASC
LIMIT 25