1

I have the following Python code, along with a few GGML models. The goal is to summarize all my txt files using LLM models rather than sentence transformers. The first section checks the text spacing and converts it into a continuous line rather than paragraphs. The second section employs LLM to summarize the content and saves the outcome in a text file for each text in a specific folder.

The code provided works well, but lacks a header to prompt for summarization. Despite this, it doesn't display results in real-time like ChatGPT.

***I wish to see real-time updates, character by character, presented in the console, allowing me to monitor the progress and halt the process if needed. But i cannot figure out how. Do require some of guys' assistance in the following, and will greatly appreciate.

import os
import re
from llama_cpp import Llama

input_directory = r"C:\Users\Peter-Susan\Desktop\test"
output_directory = r"C:\Users\Peter-Susan\Desktop"


def join_lines_in_files(directory_path):
    try:
        # Get all text files in the directory
        files = [file for file in os.listdir(directory_path) if file.endswith('.txt')]

        for file in files:
            file_path = os.path.join(directory_path, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                # Read the content of the file
                content = f.read()

            # Use regular expression to join lines within each paragraph with a single space
            content = re.sub(r'(?<=\S)\n+', ' ', content)
            
            # Use regular expression to remove spacing before the first word in the new line
            content = re.sub(r'\n\s+', '\n', content)
            
            # Use regular expression to ensure a single space between words when joining lines
            content = re.sub(r'\s+', ' ', content)

            # Overwrite the file with the updated content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)

        print("Line joining and spacing removal completed successfully.")
    except Exception as e:
        print("An error occurred:", str(e))

# Define the path to the directory containing the text files
directory_path = input_directory

# Call the function to join lines and remove spacing in files in the specified directory
join_lines_in_files(directory_path)

# Load the Llama model
model_path = "./models/llama-2-7b-chat.ggmlv3.q5_K_M.bin"
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=7)

def process_query(query, max_tokens=2048, temperature=0.1, top_p=0.5, stop=["#"]):
    try:
        # Generate the response using the query
        response = llm(
            query,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )
        response_text = response["choices"][0]["text"].strip()
        return response_text
    except Exception as e:
        print("Error generating response:", str(e))
        return None

def get_title_from_path(file_path):
    return os.path.splitext(os.path.basename(file_path))[0]

def process_text_file(input_file_path, output_directory):
    # Read text from the input file
    with open(input_file_path, "r", encoding="utf-8") as file:
        file_content = file.read()

    # Concatenate the header with the file content
    header = 'Summarize in detail with at least 50 words: '
    input_text = header + file_content

    print(input_text)

    # Process the input text using the Llama model
    if input_text:
        response = process_query(input_text)
        if response:
            # Write the generated output to a text file
            output_file_path = os.path.join(
                output_directory,
                f"{get_title_from_path(input_file_path)}_summarized.txt"
            )
            with open(output_file_path, "w", encoding="utf-8") as output_file:
                output_file.write(response)
            print(f"Summarized content for '{input_file_path}' written to '{output_file_path}'.")
        else:
            print(f"Error generating response for '{input_file_path}'.")
    else:
        print(f"Error: Could not read text from '{input_file_path}'.")

if __name__ == "__main__":
    # List all text files in the input directory
    for filename in os.listdir(input_directory):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_directory, filename)
            process_text_file(file_path, output_directory)

Even bard, Chatgpt and Claude are not able to help, likely they dont have knowledge about Llama models. I wish to see real-time updates, character by character, presented in the console, allowing me to monitor the progress and halt the process if needed.

jackfood
  • 11
  • 1

0 Answers0