
I have around 30 GB of JSON data spread across multiple files and want to build a query bot on top of it. I have already built the same thing with text files, but I am not sure how it will work for JSON data.

I have explored JSONLoader, but I don't know how to use it to convert the JSON data into vectors and store them in ChromaDB so that I can query them: https://python.langchain.com/docs/modules/data_connection/document_loaders/json

spam.json

[
 {
   "class": "ham",
   "message": "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
 },
 {
   "class": "ham",
   "message": "Ok lar... Joking wif u oni..."
 },
 {
   "class": "spam",
   "message": "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 },
 {
   "class": "ham",
   "message": "U dun say so early hor... U c already then say..."
 },
 {
   "class": "ham",
   "message": "Nah I don't think he goes to usf, he lives around here though"
 },
 {
   "class": "spam",
   "message": "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, ��1.50 to rcv"
 }
]
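
Based on those docs, my current understanding is that JSONLoader takes a jq_schema to pick content out of each file, plus an optional content_key and metadata_func. Here is a minimal sketch of how I think it would apply to one file like the sample above (the jq expression and the metadata handling are my assumptions, not something I have verified at scale):

from langchain.document_loaders import JSONLoader

# Assumed approach: one document per message, keeping the "class"
# label as metadata so it is not lost during embedding
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["class"] = record.get("class")
    return metadata

loader = JSONLoader(
    file_path='spam.json',
    jq_schema='.[]',              # one document per array element
    content_key='message',        # embed just the message text
    metadata_func=metadata_func,
)
docs = loader.load()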

Code for text data:

# Loading and Splitting the Documents
from langchain.document_loaders import DirectoryLoader

directory = '/content/drive/MyDrive/Data Science/LLM/docs/text files'

def load_docs(directory):
  loader = DirectoryLoader(directory)
  documents = loader.load()
  return documents

documents = load_docs(directory)
len(documents)


from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents,chunk_size=1000,chunk_overlap=20):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  docs = text_splitter.split_documents(documents)
  return docs

docs = split_docs(documents)
print(len(docs))

# Embedding Text Using Langchain
from langchain.embeddings import SentenceTransformerEmbeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

#Creating Vector Store with Chroma DB
from langchain.vectorstores import Chroma
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_db"

vectordb = Chroma.from_documents(
    documents=docs, embedding=embeddings, persist_directory=persist_directory
)

vectordb.persist()

#Using OpenAI Large Language Models (LLM) with Chroma DB
import os
os.environ["OPENAI_API_KEY"] = "sk-your-key"

from langchain.chat_models import ChatOpenAI
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)

#Extracting Answers from Documents
from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)

query = "who is Mr. Jabez Wilson?"
matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
answer

What I tried for JSON data:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import JSONLoader
import json

# Define a simple JSON schema (modify as needed)
json_schema = {
    "class": "string",
    "message": "string"
}

# Function to validate a JSON document against a schema
def validate_json(json_data, schema):
    return all(key in json_data for key in schema.keys())

# 1. Load JSON Files
def load_json_docs(directory):
    # JSONLoader needs a jq_schema (and the jq package installed);
    # '.[]' yields one document per array element, and text_content=False
    # keeps each element as a JSON string so json.loads below can parse it
    loader = DirectoryLoader(
        directory,
        glob='**/*.json',
        loader_cls=JSONLoader,
        loader_kwargs={'jq_schema': '.[]', 'text_content': False},
    )
    documents = loader.load()
    
    # Manually filter and validate documents based on the JSON schema
    valid_documents = []
    for doc in documents:
        try:
            # Parse the JSON content
            json_data = json.loads(doc.page_content)
            if validate_json(json_data, json_schema):
                valid_documents.append(doc)
        except json.JSONDecodeError:
            pass  # Invalid JSON format, skip this document
    
    return valid_documents

directory = '/content/drive/MyDrive/Data Science/LLM/docs/json files'
json_documents = load_json_docs(directory)
len(json_documents)

# 2. Split JSON Documents
def split_json_docs(documents, chunk_size=1000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs

split_json_documents = split_json_docs(json_documents)
print(len(split_json_documents))

# 3. Embedding Text Using Langchain
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# 4. Creating Vector Store with Chroma DB
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_json_db"

vectordb = Chroma.from_documents(
    documents=split_json_documents, embedding=embeddings, persist_directory=persist_directory
)

vectordb.persist()


# 5. Using OpenAI Large Language Models (LLM) with Chroma DB
import os
os.environ["OPENAI_API_KEY"] = "sk-your-key"

model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)

# 6. Extracting Answers from Documents
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)

query = "who is Mr. Jabez Wilson?"
matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
answer
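
If the documents carried the class label as metadata (as in the JSONLoader sketch near the top), I assume the search could also be filtered on it via Chroma's metadata filter, something like:

# Hypothetical filtered search; this only works if "class" was
# actually stored as metadata when the vector store was built
matching_spam = vectordb.similarity_search(query, filter={"class": "spam"})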