I'm currently loading pre-vectorized text data into a Chroma vector database from a Jupyter notebook. However, I've run into an issue where I receive a "bad allocation" error.
To provide some context, I have an existing collection on my local disk that contains 500K rows of text data. When I attempt to reload this existing collection and add more text data to it, I encounter the "bad allocation" error.
Based on my understanding, the error occurs because the client cannot handle such a large file and load it all into RAM. My question is: how should we structure the code that imports and persists the data so that we can avoid this memory allocation error?
I've tried calling persist (client_wiki_ja.persist()) after each batch and after a single file containing 100k rows of text data, but the same issue remains.
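To clarify the "after a single file" variant: the persist call was moved out of the per-batch loop so it runs once per parquet file, roughly like this (a minimal sketch only; parquet_file_paths and add_file_to_collection are hypothetical placeholders for the directory listing and the batching loop in load_files below):

# Sketch only: persist once per parquet file instead of once per batch.
# parquet_file_paths and add_file_to_collection are hypothetical placeholders
# for the directory listing and the batching loop inside load_files() below.
for file_path in parquet_file_paths:
    add_file_to_collection(file_path)  # add all rows of this file in batches
    client_wiki_ja.persist()           # persist after the whole file has been added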
Client initialization and retrieval of the existing collection from local disk
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

client_wiki_ja = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=database_directory,
))

cohere_ef = embedding_functions.CohereEmbeddingFunction(
    api_key=api_key_embedding_cohere,
    model_name="multilingual-22-12")

# Get a collection object from an existing collection, by name. Raises an exception if it is not found.
collection_wiki_ja = client_wiki_ja.get_collection(name=collection_name, embedding_function=cohere_ef)
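For reference, the size of the reloaded collection can be checked with collection.count() before adding more data; it should report the existing ~500K rows:

# Sanity check: the reloaded collection should report roughly 500K existing rows
print(collection_wiki_ja.count())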
Importing text data into the vector database
import os

import pandas as pd
from tqdm import tqdm

def load_files(batch_size=500, max_files_load=2, startFile_index=4):
    # startFile_index starts from 0 (e.g. train-00003-of-00035-99c97d6c1602f565.parquet -> startFile_index=3)
    directory = r"C:\Users\liewg\Desktop\p005-butumon-go\wikipedia-22-12-ja-embeddings\data"  # location of the wiki data
    file_list = os.listdir(directory)
    counter = 0  # how many files have been inserted
    for idx, filename in enumerate(file_list):  # multiple files
        if idx < startFile_index:
            continue  # skip files before startFile_index
        file_path = os.path.join(directory, filename)  # single parquet file
        print(file_path)
        df = pd.read_parquet(file_path, engine='pyarrow')
        # df = df.head(3)
        df['emb'] = df['emb'].apply(lambda x: x.tolist())
        print("finished convert to list...")
        # print("number of rows: ", df.count())
        rows = df  # .head(10000)
        for i in tqdm(range(0, len(rows), batch_size)):  # per file
            i_end = min(len(rows), i + batch_size)  # end point of this batch
            batch = rows[i:i_end]
            embeds_batch = [emb for emb in batch['emb']]
            text_batch = [text for text in batch['text']]
            metadatas_batch = [{"source": title} for title in batch['title']]
            ids_batch = [str(id) for id in batch['id']]
            collection_wiki_ja.add(
                embeddings=embeds_batch,
                documents=text_batch,
                metadatas=metadatas_batch,
                ids=ids_batch
            )
            print("batch:", i)
            print("persist starts")
            client_wiki_ja.persist()  # persist after each batch
            print("persist ends")
            # if i == 2:
            #     break
        print("completed")
        counter += 1
        if counter == max_files_load:
            break
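For illustration only, below is a rough sketch of reading a parquet file in row-group batches with pyarrow's ParquetFile.iter_batches instead of loading it as one DataFrame, so that only one chunk sits in RAM at a time. I have not verified that this avoids the bad allocation error, and the 10,000-row chunk size is an arbitrary assumption:

import pyarrow.parquet as pq

def load_file_in_chunks(file_path, chunk_rows=10_000):
    # Sketch: stream one parquet file in row batches instead of reading it
    # into a single DataFrame, so only chunk_rows rows are held in memory at once.
    parquet_file = pq.ParquetFile(file_path)
    for record_batch in parquet_file.iter_batches(batch_size=chunk_rows):
        chunk = record_batch.to_pandas()
        collection_wiki_ja.add(
            embeddings=[emb.tolist() for emb in chunk['emb']],  # same conversion as in load_files
            documents=list(chunk['text']),
            metadatas=[{"source": title} for title in chunk['title']],
            ids=[str(row_id) for row_id in chunk['id']],
        )
    client_wiki_ja.persist()  # persist once per file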
Error
Error Traceback (most recent call last)
Cell In[9], line 1
----> 1 load_files()
Cell In[8], line 42, in load_files(batch_size, max_files_load, startFile_index)
40 print("batch:", i)
41 print("persist starts")
---> 42 client_wiki_ja.persist() # for after initial
43 print("persist ends")
45 # if i == 2:
46 # break
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\chromadb\api\local.py:544, in LocalAPI.persist(self)
536 @override
537 def persist(self) -> bool:
538 """Persist the database to disk.
539
540 Returns:
541 True if the database was persisted successfully
542
543 """
--> 544 self._db.persist()
545 return True
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\chromadb\db\duckdb.py:473, in PersistentDuckDB.persist(self)
470 if self._conn.query("SELECT COUNT() FROM embeddings") == 0:
471 return
--> 473 self._conn.execute(
474 f"""
475 COPY
476 (SELECT * FROM embeddings)
477 TO '{self._save_folder}/chroma-embeddings.parquet'
478 (FORMAT PARQUET);
479 """
480 )
482 self._conn.execute(
483 f"""
484 COPY
(...)
488 """
489 )
Error: Invalid Error: bad allocation