I am new to LangChain and was trying to implement a simple Q&A system based on an example tutorial online.
The code is as follows:
from langchain.llms import LlamaCpp
from langchain.llms import GPT4All  # note: imported but not used below
from langchain.embeddings import LlamaCppEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
def write_text_file(content, file_path):
    try:
        with open(file_path, 'w') as file:
            file.write(content)
        return True
    except Exception as e:
        print(f"Error occurred while writing the file: {e}")
        return False
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Answer:"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
llm = LlamaCpp(model_path="airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q2_K.bin")
embeddings = LlamaCppEmbeddings(model_path="airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q2_K.bin")
llm_chain = LLMChain(llm=llm, prompt=prompt)
file_path = "corpus_v1.txt"
loader = TextLoader(file_path)
docs = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_documents(docs)
db = Chroma.from_documents(texts, embeddings)
question = "What is ant–fungus mutualism?"
similar_doc = db.similarity_search(question, k=1)
context = similar_doc[0].page_content
query_llm = LLMChain(llm=llm, prompt=prompt)
response = query_llm.run({"context": context, "question": question})
print(response)
The data can be found here, and the model used can be found at this link.
I am getting the following error (the traceback below is from a re-run with chunk_size=100; the same error occurs with chunk_size=500):
llama_tokenize_with_model: too many tokens
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[10], line 6
4 text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
5 texts = text_splitter.split_documents(docs)
----> 6 db = Chroma.from_documents(texts, embeddings)
File ~/miniconda3/envs/tensorflow/lib/python3.10/site-packages/langchain/vectorstores/chroma.py:603, in Chroma.from_documents(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)
601 texts = [doc.page_content for doc in documents]
602 metadatas = [doc.metadata for doc in documents]
--> 603 return cls.from_texts(
604 texts=texts,
605 embedding=embedding,
606 metadatas=metadatas,
607 ids=ids,
608 collection_name=collection_name,
609 persist_directory=persist_directory,
610 client_settings=client_settings,
611 client=client,
612 collection_metadata=collection_metadata,
613 **kwargs,
614 )
File ~/miniconda3/envs/tensorflow/lib/python3.10/site-packages/langchain/vectorstores/chroma.py:567, in Chroma.from_texts(cls, texts, embedding, metadatas, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)
539 """Create a Chroma vectorstore from a raw documents.
540
541 If a persist_directory is specified, the collection will be persisted there.
(...)
556 Chroma: Chroma vectorstore.
557 """
558 chroma_collection = cls(
559 collection_name=collection_name,
560 embedding_function=embedding,
(...)
565 **kwargs,
566 )
--> 567 chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
568 return chroma_collection
File ~/miniconda3/envs/tensorflow/lib/python3.10/site-packages/langchain/vectorstores/chroma.py:187, in Chroma.add_texts(self, texts, metadatas, ids, **kwargs)
185 texts = list(texts)
186 if self._embedding_function is not None:
--> 187 embeddings = self._embedding_function.embed_documents(texts)
188 if metadatas:
189 # fill metadatas with empty dicts if somebody
190 # did not specify metadata for all texts
191 length_diff = len(texts) - len(metadatas)
File ~/miniconda3/envs/tensorflow/lib/python3.10/site-packages/langchain/embeddings/llamacpp.py:110, in LlamaCppEmbeddings.embed_documents(self, texts)
101 def embed_documents(self, texts: List[str]) -> List[List[float]]:
102 """Embed a list of documents using the Llama model.
103
104 Args:
(...)
108 List of embeddings, one for each text.
109 """
--> 110 embeddings = [self.client.embed(text) for text in texts]
111 return [list(map(float, e)) for e in embeddings]
File ~/miniconda3/envs/tensorflow/lib/python3.10/site-packages/langchain/embeddings/llamacpp.py:110, in <listcomp>(.0)
101 def embed_documents(self, texts: List[str]) -> List[List[float]]:
102 """Embed a list of documents using the Llama model.
103
104 Args:
(...)
108 List of embeddings, one for each text.
109 """
--> 110 embeddings = [self.client.embed(text) for text in texts]
111 return [list(map(float, e)) for e in embeddings]
File ~/miniconda3/envs/tensorflow/lib/python3.10/site-packages/llama_cpp/llama.py:812, in Llama.embed(self, input)
803 def embed(self, input: str) -> List[float]:
804 """Embed a string.
805
806 Args:
(...)
810 A list of embeddings
811 """
--> 812 return list(map(float, self.create_embedding(input)["data"][0]["embedding"]))
File ~/miniconda3/envs/tensorflow/lib/python3.10/site-packages/llama_cpp/llama.py:776, in Llama.create_embedding(self, input, model)
774 tokens = self.tokenize(input.encode("utf-8"))
775 self.reset()
--> 776 self.eval(tokens)
777 n_tokens = len(tokens)
778 total_tokens += n_tokens
File ~/miniconda3/envs/tensorflow/lib/python3.10/site-packages/llama_cpp/llama.py:471, in Llama.eval(self, tokens)
469 raise RuntimeError(f"llama_eval returned {return_code}")
470 # Save tokens
--> 471 self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
472 # Save logits
473 rows = n_tokens if self.params.logits_all else 1
ValueError: could not broadcast input array from shape (8,) into shape (0,)
This error did not occur when the text in the corpus was shorter. Is there a parameter that needs to be changed?
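For reference, my current suspicion (an assumption on my part, not something confirmed) is the context window of the embedding model: llama-cpp-python defaults n_ctx to 512 tokens, and CharacterTextSplitter measures chunk_size in characters (splitting on "\n\n"), so a chunk can end up longer than 512 tokens once the corpus grows. Would raising n_ctx, roughly as in the sketch below, be the right direction?

from langchain.llms import LlamaCpp
from langchain.embeddings import LlamaCppEmbeddings

# Sketch of the change I'm considering; n_ctx=2048 is a guessed value.
# Both LlamaCppEmbeddings and LlamaCpp accept n_ctx (the context window
# in tokens, which defaults to 512 in llama-cpp-python).
embeddings = LlamaCppEmbeddings(
    model_path="airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q2_K.bin",
    n_ctx=2048,
)
llm = LlamaCpp(
    model_path="airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q2_K.bin",
    n_ctx=2048,
)

I also wonder whether a token-aware splitter would be safer here, since CharacterTextSplitter only bounds chunks by character count and can emit oversized chunks when a piece contains no "\n\n" separator.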
These are the libraries and their versions:
langchain -> '0.0.252'
numpy -> '1.25.0'
Thanks in advance!