
I'm trying to periodically update my Deep Lake dataset when I get new documents, but I don't want to upload duplicate documents. I'm using the Python API (via LangChain):

from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DeepLake


def combine_piazza_docs(directory, filter, split=False):
    loader = DirectoryLoader(f"./{directory}", glob=filter)
    # TODO: I don't think that there is a need to split the Piazza documents
    #   because they're all split up already!
    #   however, this will be necessary for loading in the textbook and such
    print(f"Loading {directory} directory for any {filter}")
    data = loader.load()
    if split:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=1000,
        )
        print(f"Splitting {len(data)} documents")
        r_docs = splitter.split_documents(data)
        return r_docs
    
    print(f"Created {len(data)} documents from {directory}")
    return data

chunked_text = combine_piazza_docs(SOURCE_DOCUMENTS_DIR, SOURCE_DOCUMENTS_FILTER)
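
Since the end goal is to skip duplicates, one idea I had (my own sketch, not part of the working code) is to derive a deterministic ID for each chunk by hashing its page_content plus the source metadata, so the same document maps to the same ID on every run:

import hashlib

def doc_id(doc):
    # Hash the chunk text plus its source path so re-loading the same file
    # produces the same ID across runs.
    key = doc.page_content + str(doc.metadata.get("source", ""))
    return hashlib.sha256(key.encode("utf-8")).hexdigest()

chunk_ids = [doc_id(doc) for doc in chunked_text]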


# Will download the model the first time it runs (slowly)
embedding_function = SentenceTransformerEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    cache_folder="cache/",
)

print("Complete")

# This will automatically create a new Deep Lake dataset if the dataset_path does not exist
vector_store = DeepLake.from_documents(
    chunked_text,
    embedding_function,
    dataset_path=VECTOR_STORE_PATH,
    token="token",
)

The documents I'm currently loading aren't being split, so I don't think splitting is the issue.
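
What I'm imagining is something like the untested sketch below: on later runs, open the existing dataset with the DeepLake wrapper instead of calling from_documents again, and only add chunks whose IDs aren't already in a small local manifest. The manifest file, the doc_id helper above, and the assumption that add_documents accepts explicit ids are all mine; is there a built-in way to do this?

import json
import os

MANIFEST_PATH = "uploaded_ids.json"  # hypothetical local record of uploaded chunk IDs

# IDs uploaded on previous runs (empty set on the first run).
uploaded = set()
if os.path.exists(MANIFEST_PATH):
    with open(MANIFEST_PATH) as f:
        uploaded = set(json.load(f))

# Keep only chunks that were not uploaded before.
new_docs = [doc for doc in chunked_text if doc_id(doc) not in uploaded]

if new_docs:
    # Open the existing dataset rather than re-creating it with from_documents.
    store = DeepLake(
        dataset_path=VECTOR_STORE_PATH,
        embedding_function=embedding_function,
        token="token",
    )
    # Assuming add_documents forwards ids to the underlying add_texts call.
    store.add_documents(new_docs, ids=[doc_id(d) for d in new_docs])

    # Remember what was uploaded so the next run can skip it.
    uploaded.update(doc_id(d) for d in new_docs)
    with open(MANIFEST_PATH, "w") as f:
        json.dump(sorted(uploaded), f)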
