My Python script tokenizes text files, splits the tokens into fixed-size chunks, and inserts the chunks into an SQLite database. It works for some files, but for others only a subset of the chunks (often the last ones) actually makes it into the database. The script uses the tokenizers library for text processing and SQLite for storage; each chunk is written to a binary file and then inserted into the database. The problem does not appear to be related to file naming or text encoding.
import sqlite3

from tokenizers import Tokenizer
from tokenizers.models import BPE
chunk_size = 4096
files_to_process = [
    "howtooutsmartthecb.txt",
    "fdcpa.txt",
    "fcra.txt",
    "doddfrankact.txt",
    "consumerrights.txt",
    "consumerprotectioninstates.txt",
    "cfpbconsumerlaw.txt",
    "blackslaw6th.txt",
]
db_path = "document_chunks.db"
connection = sqlite3.connect(db_path)
cursor = connection.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS chunks(
        id INTEGER PRIMARY KEY,
        document_name TEXT,
        chunk_index INTEGER,
        chunk_data BLOB
    )
''')
connection.commit()
for file_name in files_to_process:
    # Read the raw bytes and decode the document as UTF-8.
    with open(file_name, 'rb') as f:
        file_data = f.read()
    text = file_data.decode('utf-8')

    # Train a fresh BPE tokenizer on all the files, then tokenize this document.
    tokenizer = Tokenizer(BPE())
    tokenizer.train(files_to_process)
    encoding = tokenizer.encode(text)
    tokens = encoding.tokens

    # Split the token list into chunks of chunk_size tokens and serialize each one.
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    binary_chunks = ["".join(chunk).encode('utf-8') for chunk in chunks]

    # Write every chunk to its own binary file.
    for index, chunk in enumerate(binary_chunks):
        output_file_path = f"chunk_{file_name}_{index}.bin"
        try:
            with open(output_file_path, 'wb') as output_file:
                output_file.write(chunk)
            print(f"Saved chunk to {output_file_path}")
        except Exception as e:
            print(f"Error saving chunk to {output_file_path}: {e}")
# Read the chunk files back from disk and insert them into the database.
for file_name in files_to_process:
    for index, _ in enumerate(binary_chunks):
        try:
            chunk_path = f"chunk_{file_name}_{index}.bin"
            with open(chunk_path, "rb") as chunk_file:
                chunk_data = chunk_file.read()
            insert_query = "INSERT INTO chunks (document_name, chunk_index, chunk_data) VALUES (?, ?, ?)"
            cursor.execute(insert_query, (file_name, index, chunk_data))
            print(f"Inserted chunk from {file_name} into the database")
        except Exception as e:
            print(f"Error inserting chunk into the database: {e}")
connection.commit()
connection.close()
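
The chunk files themselves are written without errors. To double-check what ends up on disk, I count the chunk files per source document (a quick sketch; it only assumes the chunk_<document>_<index>.bin naming used by the script):

import collections
import glob

# Tally chunk files on disk per source document.
counts = collections.Counter()
for path in glob.glob("chunk_*.bin"):
    # File names look like chunk_<document>_<index>.bin, so everything
    # between the first and last underscore is the document name.
    document = path[len("chunk_"):path.rfind("_")]
    counts[document] += 1
for document, count in sorted(counts.items()):
    print(f"{document}: {count} chunk files on disk")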
I added print statements to report errors during tokenization, chunking, and insertion, and none were printed. I confirmed that all of the files are UTF-8 encoded, and I verified that the file structure and directory paths are consistent across all files.
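
For reference, this is the query I run afterwards to see how many chunks each document actually got (a minimal check against the chunks table created above):

import sqlite3

connection = sqlite3.connect("document_chunks.db")
cursor = connection.cursor()
# Count inserted rows per document.
cursor.execute(
    "SELECT document_name, COUNT(*) FROM chunks GROUP BY document_name ORDER BY document_name"
)
for document_name, count in cursor.fetchall():
    print(f"{document_name}: {count} chunks in the database")
connection.close()

For the affected files this count is lower than the number of chunk files on disk, and that discrepancy is what I'm trying to track down.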