0

I want to train LLM on custom dataset. This is my code:

from llama_index import SimpleDirectoryReader, GPTListIndex, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from langchain import OpenAI
import os

apikey = "sk-fzDu2MtAdhPiR"
os.environ["OPENAI_API_KEY"] = apikey


def createVectorIndex(path):
    
    max_input = 4000
    tokens = 256
    chunk_size = 600
    max_chunk_overlap = 0.20
    
    prompt_helper = PromptHelper(max_input, tokens, max_chunk_overlap, chunk_size_limit = chunk_size)
    
    #define LLM
    llm_predictor = LLMPredictor(llm=OpenAI(temperature = 0, model_name = "text-ada-001", max_tokens = tokens))
    
    #load data
    data = SimpleDirectoryReader(path).load_data()

    vector_index = GPTVectorStoreIndex(documents = data, 
                                       llm_predictor = llm_predictor, 
                                       prompt_helper = prompt_helper, 
                                       index_name="my_index")
    
    vector_index.save_to_disc("vector_index.json")
    
    return vector_index


vector_index = createVectorIndex("gpt_data")

I am getting this error:

ValueError                                Traceback (most recent call last)
<ipython-input-39-de1d095ec55f> in <module>
----> 1 vector_index = createVectorIndex("gpt_data")

<ipython-input-38-f5b73669d881> in createVectorIndex(path)
     17         print(i)
     18 
---> 19     vector_index = GPTVectorStoreIndex(documents = data, 
     20                                        llm_predictor = llm_predictor,
     21                                        prompt_helper = prompt_helper,

/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/llama_index/indices/vector_store/base.py in __init__(self, nodes, index_struct, service_context, storage_context, use_async, store_nodes_override, **kwargs)
     43         self._use_async = use_async
     44         self._store_nodes_override = store_nodes_override
---> 45         super().__init__(
     46             nodes=nodes,
     47             index_struct=index_struct,

/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/llama_index/indices/base.py in __init__(self, nodes, index_struct, storage_context, service_context, **kwargs)
     43         """Initialize with parameters."""
     44         if index_struct is None and nodes is None:
---> 45             raise ValueError("One of documents or index_struct must be provided.")
     46         if index_struct is not None and nodes is not None:
     47             raise ValueError("Only one of documents or index_struct can be provided.")

ValueError: One of documents or index_struct must be provided.

I have tried with and without index_name="my_index", error is the same.

"gpt_data" is folder with 1 txt file.

This line:

data = SimpleDirectoryReader(path).load_data()

Gives me this:

[Document(text="Kimberlites are ... m thermochemica, doc_id='5bbe7a75-7817-4c00-bce1-db9256fde270', embedding=None, doc_hash='87fb32e4834c847365178498a6ce166a68df5d5547c8631df46d577616dd37e8', extra_info=None)]

What am I doing wrong?

Helen
  • 87,344
  • 17
  • 243
  • 314
taga
  • 3,537
  • 13
  • 53
  • 119

0 Answers0