I want all the articles related to specific tag example sustainability from documents. But it is only returing me Four articles. There are total 7 articles related to sustainability in vectorstore out of 20 articles.
Here is my code:
import pinecone
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import os
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
pinecone_api_key = "xxxxxxxx"
pinecone_env = "xxxxxxxxxx"
# pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENV"])
pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)
embeddings = OpenAIEmbeddings()
index_name="langchain-self-retriever-ppo"
index = pinecone.Index(index_name)
text_field = "text"
# vectorstore = Pinecone.from_documents(
# docs, embeddings, index_name="langchain-self-retriever-demo"
# )
vectorstore = Pinecone(
index, embeddings.embed_query, text_field
)
metadata_field_info=[
AttributeInfo(
name="headline",
description="The headline of the news article",
type="string or list[string]",
),
AttributeInfo(
name="date",
description="The date, news article was published",
type="integer",
),
AttributeInfo(
name="publication",
description="The name of the publication which published this news article",
type="string",
),
AttributeInfo(
name="domain",
description="The domain of the news article",
type="float"
),
]
document_content_description = "Brief summary of a news article"
llm = OpenAI(temperature=0)
# retriever = SelfQueryRetriever.from_llm(llm, vectorstore, document_content_description, metadata_field_info, verbose=True)
retriever = SelfQueryRetriever.from_llm(
llm,
vectorstore,
document_content_description,
metadata_field_info,
enable_limit=True,
verbose=True
)
# This example only specifies a relevant query
retrieved_docs = retriever.get_relevant_documents("Articles which are related to sustainability")
print(retrieved_docs)
print(len(retrieved_docs))
I have gone inside get_relevant_documents
method here it uses self.vectorstore.search
which calls self.similarity_search
method which by defaults sets limit to 4 if not given.
I tried setting limit to 7 it returned 7 sustainability
articles.
But I wouldn't know how much articles will be related to sustainability
so I can't by default set the limit.