0

Good morning,

I'm trying to develop a little bot that can answer questions based on a small knowledge base loaded beforehand in text format.

I launch the bot I can ask my question but it only returns the question and the "context" but never an answer. I don't understand why.. I tried different modifications but nothing to do.

Here is the code if the community can enlighten me? :)

 #VERSION 0.2
from transformers import CamembertForCausalLM, CamembertTokenizer
import os
import torch
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# Charger le modèle CamemBERT pré-entraîné
model_name = "camembert/camembert-large"
model = CamembertForCausalLM.from_pretrained(model_name)
tokenizer = CamembertTokenizer.from_pretrained(model_name)
print("Modèle chargé :", model)
print("Tokenizer chargé :", tokenizer)
print("Tokens spéciaux :", tokenizer.special_tokens_map)
print("Mode décodeur :", model.config.is_decoder)

# Fonction pour charger les fichiers texte à partir d'un chemin
def load_text_data(company_id):
   # path = os.path.join("Bdd", str(user_id), str(company_id), db_name)
  #  path = os.path.join("DB", str(company_id))

    text_data = """
    Il fait un grand soleil aujourd'hui et il y a aussi un peu de vent chaud.
    """
    # for filename in os.listdir(path):
    #     with open(os.path.join(path, filename), 'r') as file:
    #         text_data += file.read() + "\n"
    return text_data

def preprocess_question(question):
    tokens = tokenizer.tokenize(question)
    preprocessed_question = " ".join(tokens)
    return preprocessed_question

def generate_response(preprocessed_question, company_id, max_length=512, num_beams=5, no_repeat_ngram_size=2):
    text_data = load_text_data(company_id)
    input_text = f"Question : {preprocessed_question}\nDonnées : {text_data}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=num_beams,
            no_repeat_ngram_size=no_repeat_ngram_size,
            early_stopping=True
        )
        response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Retourner uniquement la réponse générée
    return response

if __name__ == '__main__':
    question = input("Posez votre question : ")
   # user_id = input("ID utilisateur : ")
    #company_id = input("ID de la société : ")
    #db_name = input("Nom de la base de données : ")

    preprocessed_question = preprocess_question(question)
    response = generate_response(preprocessed_question,company_id=1)

    print("Réponse du modèle :", response)

Thanks in advance !

demenvil
  • 1,089
  • 1
  • 12
  • 25

0 Answers0