
I am trying to run a chat-with-PDF LLM app using Python, and I want to accelerate both the embedding model and the inference model using my GPUs. This is my entire code (https://github.com/aortiz-WW/LLM/tree/gpu_enabled):

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplate import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
import torch
from transformers import AutoModelForCausalLM

print(torch.cuda.is_available()) # This returns True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=500,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_vectorstore(text_chunks):
    
    model_kwargs = {'device': "cuda"}
    embeddings = HuggingFaceInstructEmbeddings(model_name="BAAI/bge-large-en", model_kwargs=model_kwargs)
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore


def get_conversation_chain(vectorstore):
    
    # llm = HuggingFaceHub(repo_id="tiiuae/falcon-7b", model_kwargs={"temperature":0.3, "max_length":2048})
    llm = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b", device=device, device_map='auto')
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain


def handle_userinput(user_question):
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)


def main():
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)

                # get the text chunks
                text_chunks = get_text_chunks(raw_text)

                # create vector store
                vectorstore = get_vectorstore(text_chunks)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)


if __name__ == '__main__':
    main()

The error I am getting occurs when I try to create the embeddings:

DeferredCudaCallError: CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "../aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. CUDA call was originally invoked at: [' File "/usr/lib/python3.10/threading.py", line 973, in _bootstrap\n self._bootstrap_inner()\n', ' File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner\n self.run()\n', ' File "/usr/lib/python3.10/threading.py", line 953, in run\n self._target(*self._args, **self._kwargs)\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 304, in _run_script_thread\n self._run_script(request.rerun_data)\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 552, in _run_script\n exec(code, module.__dict__)\n', ' File "/home/aortiz/LLM/app.py", line 11, in <module>\n import torch\n', ' File "<frozen importlib._bootstrap>", line 1027, in _find_and_load\n', ' File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked\n', ' File "<frozen importlib._bootstrap>", line 688, in _load_unlocked\n', ' File "<frozen importlib._bootstrap_external>", line 883, in exec_module\n', ' File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/__init__.py", line 1146, in <module>\n _C._initExtension(manager_path())\n', ' File "<frozen importlib._bootstrap>", line 1027, in _find_and_load\n', ' File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked\n', ' File "<frozen importlib._bootstrap>", line 688, in _load_unlocked\n', ' File "<frozen importlib._bootstrap_external>", line 883, in exec_module\n', ' File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 197, in <module>\n _lazy_call(_check_capability)\n', ' File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 195, in _lazy_call\n _queued_calls.append((callable, traceback.format_stack()))\n']
Traceback:
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 552, in _run_script
    exec(code, module.__dict__)
File "/home/aortiz/LLM/app.py", line 110, in <module>
    main()
File "/home/aortiz/LLM/app.py", line 102, in main
    vectorstore = get_vectorstore(text_chunks)
File "/home/aortiz/LLM/app.py", line 41, in get_vectorstore
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/langchain/vectorstores/faiss.py", line 607, in from_texts
    embeddings = embedding.embed_documents(texts)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/langchain/embeddings/huggingface.py", line 162, in embed_documents
    embeddings = self.client.encode(instruction_pairs, **self.encode_kwargs)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/InstructorEmbedding/instructor.py", line 521, in encode
    self.to(device)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1145, in to
    return self._apply(convert)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 797, in _apply
    module._apply(fn)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 797, in _apply
    module._apply(fn)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 797, in _apply
    module._apply(fn)
[Previous line repeated 1 more time]
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 820, in _apply
    param_applied = fn(param)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1143, in convert
    return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
File "/home/aortiz/LLM/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 264, in _lazy_init
    raise DeferredCudaCallError(msg) from e
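
To take Streamlit out of the equation, this is the minimal standalone script I would use to reproduce just the embedding step (only a sketch; the test sentence is a placeholder and I have not confirmed whether it hits the same assert):

import torch
from langchain.embeddings import HuggingFaceInstructEmbeddings

print(torch.cuda.is_available())   # True on my machine
print(torch.cuda.device_count())   # the failing assert (device >= 0 && device < num_gpus) suggests checking this

embeddings = HuggingFaceInstructEmbeddings(
    model_name="BAAI/bge-large-en",
    model_kwargs={"device": "cuda:0"},  # pin to an explicit device index instead of plain "cuda"
)
print(len(embeddings.embed_documents(["This is a test sentence."])[0]))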

I realize that the error is triggered by this line in the code:

vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

I believe this is because FAISS stores the embeddings on local disk rather than on the GPU. Is this the correct line of thought? I just need some guidance on how to use my GPUs (or at least one of them) to speed up the whole process of embedding the PDF documents and running inference on them.
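
For the inference side, the direction I was considering is to load the model locally and hand LangChain a transformers pipeline instead of the raw AutoModelForCausalLM object (again only a sketch, using the smaller tiiuae/falcon-7b-instruct as an example; I have not verified that this avoids the CUDA error):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline

model_id = "tiiuae/falcon-7b-instruct"  # smaller stand-in for falcon-40b
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",           # let accelerate spread the layers over the available GPUs
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,      # Falcon needs this on the transformers version I have installed
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
llm = HuggingFacePipeline(pipeline=pipe)  # this would replace the llm in get_conversation_chain

The idea is that device_map="auto" would handle GPU placement for the LLM, while the embedding model stays pinned to a single device.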

  • I understand, but I don't see how to solve my problem. I see that the PDF embeddings are then not stored on the GPU, but how can I use the GPU to speed up my process? – Angel Ortiz Aug 17 '23 at 12:18
  • What do you recommend I do with this problem then? Because it is giving me a cuda error. I don't see how a filesystem helps – Angel Ortiz Aug 17 '23 at 12:23
  • What can I do then about the DeferredCudaCallError that I am facing because of the get_vectorstore function? – Angel Ortiz Aug 17 '23 at 13:23
