I am trying to build a chat-with-PDF LLM app using Python. I want to accelerate the embedding and inference models using my GPUs. This is my entire code (also at https://github.com/aortiz-WW/LLM/tree/gpu_enabled):
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplate import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
import torch
from transformers import AutoModelForCausalLM

print(torch.cuda.is_available())  # This returns True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=500,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_vectorstore(text_chunks):
    model_kwargs = {'device': "cuda"}
    embeddings = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-large-en", model_kwargs=model_kwargs)
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore


def get_conversation_chain(vectorstore):
    # llm = HuggingFaceHub(repo_id="tiiuae/falcon-7b", model_kwargs={"temperature":0.3, "max_length":2048})
    llm = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b", device=device, device_map='auto')
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain


def handle_userinput(user_question):
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)


def main():
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)
    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)
                # get the text chunks
                text_chunks = get_text_chunks(raw_text)
                # create vector store
                vectorstore = get_vectorstore(text_chunks)
                # create conversation chain
                st.session_state.conversation = get_conversation_chain(vectorstore)


if __name__ == '__main__':
    main()
The error I am getting occurs when I try to create the embeddings:
DeferredCudaCallError: CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "../aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. CUDA call was originally invoked at: [' File "/usr/lib/python3.10/threading.py", line 973, in _bootstrap\n self._bootstrap_inner()\n', ' File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner\n self.run()\n', ' File "/usr/lib/python3.10/threading.py", line 953, in run\n self._target(*self._args, **self._kwargs)\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 304, in _run_script_thread\n self._run_script(request.rerun_data)\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 552, in _run_script\n exec(code, module.__dict__)\n', ' File "/home/aortiz/LLM/app.py", line 11, in <module>\n import torch\n', ' File "<frozen importlib._bootstrap>", line 1027, in _find_and_load\n', ' File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked\n', ' File "<frozen importlib._bootstrap>", line 688, in _load_unlocked\n', ' File "<frozen importlib._bootstrap_external>", line 883, in exec_module\n', ' File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/__init__.py", line 1146, in <module>\n _C._initExtension(manager_path())\n', ' File "<frozen importlib._bootstrap>", line 1027, in _find_and_load\n', ' File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked\n', ' File "<frozen importlib._bootstrap>", line 688, in _load_unlocked\n', ' File "<frozen importlib._bootstrap_external>", line 883, in exec_module\n', ' File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 197, in <module>\n _lazy_call(_check_capability)\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 195, in _lazy_call\n _queued_calls.append((callable, traceback.format_stack()))\n']
Traceback:
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 552, in _run_script
exec(code, module.__dict__)
File "/home/aortiz/LLM/app.py", line 110, in <module>
main()
File "/home/aortiz/LLM/app.py", line 102, in main
vectorstore = get_vectorstore(text_chunks)
File "/home/aortiz/LLM/app.py", line 41, in get_vectorstore
vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/langchain/vectorstores/faiss.py", line 607, in from_texts
embeddings = embedding.embed_documents(texts)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/langchain/embeddings/huggingface.py", line 162, in embed_documents
embeddings = self.client.encode(instruction_pairs, **self.encode_kwargs)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/InstructorEmbedding/instructor.py", line 521, in encode
self.to(device)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1145, in to
return self._apply(convert)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 797, in _apply
module._apply(fn)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 797, in _apply
module._apply(fn)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 797, in _apply
module._apply(fn)
[Previous line repeated 1 more time]
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 820, in _apply
param_applied = fn(param)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1143, in convert
return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 264, in _lazy_init
raise DeferredCudaCallError(msg) from e
I realize that the error is occurring at this line in the code:

    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

I believe this is because FAISS stores the embeddings on local disk rather than on the GPU. Is this the correct line of thought? I just need some guidance on how to use my GPUs (or at least one of them) to speed up the whole process of embedding the PDF documents and running inference on them.
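To make concrete what I am hoping to end up with, here is a rough, untested sketch of the direction I have in mind: pin the embedding model to a single GPU and wrap the generation model in a transformers pipeline so that the chain receives a LangChain LLM object instead of a raw AutoModelForCausalLM. I am assuming that model_kwargs={'device': 'cuda:0'} is passed through to the underlying embedding model and that HuggingFacePipeline can take a ready-made transformers pipeline like this:

    from langchain.embeddings import HuggingFaceInstructEmbeddings
    from langchain.llms import HuggingFacePipeline
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    # Embeddings pinned to one GPU; as far as I understand, FAISS keeps the
    # index in CPU RAM and only the embedding computation would run on the GPU.
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="BAAI/bge-large-en",
        model_kwargs={"device": "cuda:0"},
    )

    # Generation model wrapped in a transformers pipeline, then in
    # HuggingFacePipeline, so ConversationalRetrievalChain.from_llm() gets a
    # LangChain LLM rather than a raw transformers model.
    model_id = "tiiuae/falcon-7b"  # the smaller Falcon from my commented-out attempt
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",        # let accelerate place the weights on the GPU(s)
        trust_remote_code=True,   # Falcon ships custom modeling code
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
    )
    llm = HuggingFacePipeline(pipeline=pipe)

Is something along these lines the right way to get both the embeddings and the inference onto the GPU, and does it explain the CUDA error above?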