I want to train an LLM on a custom dataset. This is my code:
from llama_index import SimpleDirectoryReader, GPTListIndex, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from langchain import OpenAI
import os

apikey = "sk-fzDu2MtAdhPiR"
os.environ["OPENAI_API_KEY"] = apikey

def createVectorIndex(path):
    max_input = 4000
    tokens = 256
    chunk_size = 600
    max_chunk_overlap = 0.20
    prompt_helper = PromptHelper(max_input, tokens, max_chunk_overlap, chunk_size_limit=chunk_size)

    # define LLM
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-ada-001", max_tokens=tokens))

    # load data
    data = SimpleDirectoryReader(path).load_data()

    vector_index = GPTVectorStoreIndex(documents=data,
                                       llm_predictor=llm_predictor,
                                       prompt_helper=prompt_helper,
                                       index_name="my_index")
    vector_index.save_to_disc("vector_index.json")
    return vector_index

vector_index = createVectorIndex("gpt_data")
I am getting this error:
ValueError Traceback (most recent call last)
<ipython-input-39-de1d095ec55f> in <module>
----> 1 vector_index = createVectorIndex("gpt_data")
<ipython-input-38-f5b73669d881> in createVectorIndex(path)
17 print(i)
18
---> 19 vector_index = GPTVectorStoreIndex(documents = data,
20 llm_predictor = llm_predictor,
21 prompt_helper = prompt_helper,
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/llama_index/indices/vector_store/base.py in __init__(self, nodes, index_struct, service_context, storage_context, use_async, store_nodes_override, **kwargs)
43 self._use_async = use_async
44 self._store_nodes_override = store_nodes_override
---> 45 super().__init__(
46 nodes=nodes,
47 index_struct=index_struct,
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/llama_index/indices/base.py in __init__(self, nodes, index_struct, storage_context, service_context, **kwargs)
43 """Initialize with parameters."""
44 if index_struct is None and nodes is None:
---> 45 raise ValueError("One of documents or index_struct must be provided.")
46 if index_struct is not None and nodes is not None:
47 raise ValueError("Only one of documents or index_struct can be provided.")
ValueError: One of documents or index_struct must be provided.
I have tried with and without index_name="my_index"; the error is the same.
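For reference, the variant without index_name is just the same call with that keyword argument dropped, and it raises the identical ValueError:

vector_index = GPTVectorStoreIndex(documents=data,
                                   llm_predictor=llm_predictor,
                                   prompt_helper=prompt_helper)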
"gpt_data" is folder with 1 txt file.
This line:
data = SimpleDirectoryReader(path).load_data()
gives me this:
[Document(text="Kimberlites are ... m thermochemica, doc_id='5bbe7a75-7817-4c00-bce1-db9256fde270', embedding=None, doc_hash='87fb32e4834c847365178498a6ce166a68df5d5547c8631df46d577616dd37e8', extra_info=None)]
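So the reader does find the file, and load_data() returns a list with one Document in it. The print(i) you can see in the traceback above comes from a quick sanity check like this inside the function:

    # confirm the documents were actually loaded before building the index
    for i in data:
        print(i)   # prints the Document shown above, so data is not empty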
What am I doing wrong?