8

I'm trying to run a hugging face model, mode exactly "cardiffnlp/twitter-roberta-base-sentiment" on threads. But at the same time, I want just one single instance of it because it's really costly in terms of time.

In other words, I have multiple CSV files (several thousand) and each of them has around 20k-30k lines and I want that each line from all of them to be executed by the huggingface model, as you probably can imagine already this is the reason why I don't want to instantiate a model for each thread (where each thread would be used just to read one line and write it in the database). The problem with my approach is that when I'm running the code is going to give me an error from huggingface model.

RuntimeError: Already borrowed

Could any of you help me to understand how cand I fix it?

Hugging face model:

class EmotionDetection(object):
    def __init__(self, model_name="cardiffnlp/twitter-roberta-base-sentiment"):
        self.model_name = model_name
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True,
                                                     task="sentiment-analysis", device=0)

    def get_emotion_by_label(self, label: str):
        if label == "LABEL_0":
            return "negative"
        elif label == "LABEL_1":
            return "neutral"
        elif label == "LABEL_2":
            return "positive"
        else:
            print("SOMETHING IS WRONG")
            return ""

    def get_emotion(self, phrase):
        results = self.classifier(phrase)
        res = dict()
        for result in results:
            for emotion in result:
                res.update({self.get_emotion_by_label(emotion['label']): emotion['score']})
        return res

My code for generating database:

class GenerateDbThread(object):
    def __init__(self, text: str, created_at: datetime.datetime, get_emotion_function, cursor, table_name):
        self.table_name = table_name

        self.text = text
        self.created_at = created_at
        emotions = get_emotion_function(self.text)

        self.pos = emotions['positive']
        self.neg = emotions['negative']
        self.neu = emotions['neutral']

        self.cursor = cursor

    def execute(self):
        query = f"INSERT INTO {self.table_name}(date, positive, negative, neutral, tweet) " \
                f"VALUES (datetime('{str(self.created_at)}'),{self.pos},{self.neg},{self.neu}, '{self.text}')"
        self.cursor.execute(query)
        self.cursor.commit()


def get_all_data_files_path(data_dir: str):
    return [f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]


def run(file: str, table_name: str):
    df = pd.read_csv(os.path.join('data', file), delimiter=',')
    for index, row in df.iterrows():
        text = row['tweet']
        language = row['language']
        split_data = row['created_at'].split(" ")
        GTB_Time = f"{split_data[2]} {split_data[3]} {split_data[4]}"
        created_at = datetime.datetime.strptime(row['created_at'], f"%Y-%m-%d %H:%M:%S {GTB_Time}")
        if language == "en":
            GenerateDbThread(text, created_at, emotion_detector.get_emotion, cursor, table_name)


def init_db(db_name, table_name):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute(f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        uid INTEGER PRIMARY KEY AUTOINCREMENT,
        date DATETIME NOT NULL,
        positive REAL NOT NULL,
        negative REAL NOT NULL,
        neutral REAL NOT NULL,
        text TEXT NOT NULL
    )""")
    cursor.execute(f"CREATE INDEX IF NOT EXISTS ix_tweets_index ON {table_name}(uid)")
    cursor.close()


ex = ThreadPoolExecutor(max_workers=10)

files = get_all_data_files_path('data')

init_db("DB_NAME.db", "TABLE_NAME")

emotion_detector = EmotionDetection()
conn = sqlite3.connect("DB_NAME.db")
cursor = conn.cursor()

pbar = tqdm(total=len(files))
futures = [ex.submit(run, file, "TABLE_NAME") for file in files]
for future in futures:
    res = future.result()
    pbar.update(1)
pbar.close()
Mircea
  • 1,671
  • 7
  • 25
  • 41
  • Hi @mircea, were you ever able to solve this? – dmn May 26 '21 at 18:37
  • Not rly... I just give up. – Mircea May 29 '21 at 16:55
  • 1
    That's too bad :( It looks like, if you're able to downgrade `transformers`, that should work for now: https://github.com/huggingface/tokenizers/issues/537 If you're using spaCy, there is a hack-y workaround: https://github.com/explosion/spaCy/issues/6879 :( – dmn May 30 '21 at 17:08
  • One "solution" (not sure if it satisfies your needs) is to use a `threading.Lock` to prevent the model from being used by multiple threads simultaneously. Of course, that may nullify the benefit of threads, but it'll prevent an error. – rkechols Jul 01 '22 at 18:13
  • This can help https://huggingface.co/docs/transformers/pipeline_webserver – alvas Mar 21 '23 at 01:23

0 Answers0