I'm trying to run a Hugging Face model (specifically "cardiffnlp/twitter-roberta-base-sentiment") from multiple threads, but I want only a single instance of the model because loading it is really costly in terms of time.
In other words, I have several thousand CSV files, each with around 20k-30k lines, and I want every line from all of them to be run through the Hugging Face model. As you can probably imagine, this is exactly why I don't want to instantiate a separate model for each thread (where each thread just reads one line and writes the result to the database). The problem with my approach is that when I run the code, the Hugging Face model raises this error:
RuntimeError: Already borrowed
Could any of you help me understand how I can fix it?
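To make the setup clearer, this is roughly the pattern I'm after (a simplified sketch with placeholder data, not my real code): one pipeline created once and then called concurrently from a thread pool.
from concurrent.futures import ThreadPoolExecutor
from transformers import pipeline

# Simplified sketch with placeholder data: one shared pipeline called from many threads.
shared_classifier = pipeline("sentiment-analysis",
                             model="cardiffnlp/twitter-roberta-base-sentiment")

def classify(line):
    # Every worker thread calls into the same tokenizer/pipeline object here;
    # this concurrent use is where the "Already borrowed" error shows up for me.
    return shared_classifier(line)

with ThreadPoolExecutor(max_workers=10) as pool:
    results = list(pool.map(classify, ["some tweet", "another tweet"]))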
Hugging Face model:
import os
import sqlite3
import datetime
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline


class EmotionDetection(object):
    def __init__(self, model_name="cardiffnlp/twitter-roberta-base-sentiment"):
        self.model_name = model_name
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # One pipeline instance, meant to be shared by every thread
        self.classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True,
                                                     task="sentiment-analysis", device=0)

    def get_emotion_by_label(self, label: str):
        if label == "LABEL_0":
            return "negative"
        elif label == "LABEL_1":
            return "neutral"
        elif label == "LABEL_2":
            return "positive"
        else:
            print("SOMETHING IS WRONG")
            return ""

    def get_emotion(self, phrase):
        # Turn the pipeline's LABEL_* scores into a {emotion: score} dict
        results = self.classifier(phrase)
        res = dict()
        for result in results:
            for emotion in result:
                res.update({self.get_emotion_by_label(emotion['label']): emotion['score']})
        return res
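For reference, calling get_emotion on a single phrase gives a dict of scores keyed by emotion, roughly like this (the values below are made up):
# Illustrative usage; the scores are placeholder values.
detector = EmotionDetection()
print(detector.get_emotion("I love this!"))
# e.g. {'negative': 0.01, 'neutral': 0.08, 'positive': 0.91}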
My code for generating the database:
class GenerateDbThread(object):
    def __init__(self, text: str, created_at: datetime.datetime, get_emotion_function, cursor, table_name):
        self.table_name = table_name
        self.text = text
        self.created_at = created_at
        # The shared model is invoked here, once per row
        emotions = get_emotion_function(self.text)
        self.pos = emotions['positive']
        self.neg = emotions['negative']
        self.neu = emotions['neutral']
        self.cursor = cursor

    def execute(self):
        query = f"INSERT INTO {self.table_name}(date, positive, negative, neutral, text) " \
                f"VALUES (datetime('{str(self.created_at)}'), {self.pos}, {self.neg}, {self.neu}, '{self.text}')"
        self.cursor.execute(query)
        self.cursor.connection.commit()  # sqlite3 cursors have no commit(); commit on the connection
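The intended single-row flow is to construct one of these objects and then call execute() on it, something like this (simplified, with placeholder values):
# Simplified single-row usage with placeholder values:
job = GenerateDbThread(text="some tweet text",
                       created_at=datetime.datetime(2021, 5, 1, 12, 0, 0),
                       get_emotion_function=emotion_detector.get_emotion,
                       cursor=cursor,
                       table_name="TABLE_NAME")
job.execute()  # runs the INSERT and commits through the cursor's connection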
def get_all_data_files_path(data_dir: str):
    return [f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]
def run(file: str, table_name: str):
    df = pd.read_csv(os.path.join('data', file), delimiter=',')
    for index, row in df.iterrows():
        text = row['tweet']
        language = row['language']
        split_data = row['created_at'].split(" ")
        GTB_Time = f"{split_data[2]} {split_data[3]} {split_data[4]}"
        created_at = datetime.datetime.strptime(row['created_at'], f"%Y-%m-%d %H:%M:%S {GTB_Time}")
        if language == "en":
            GenerateDbThread(text, created_at, emotion_detector.get_emotion, cursor, table_name)
def init_db(db_name, table_name):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            uid INTEGER PRIMARY KEY AUTOINCREMENT,
            date DATETIME NOT NULL,
            positive REAL NOT NULL,
            negative REAL NOT NULL,
            neutral REAL NOT NULL,
            text TEXT NOT NULL
        )""")
    cursor.execute(f"CREATE INDEX IF NOT EXISTS ix_tweets_index ON {table_name}(uid)")
    cursor.close()
ex = ThreadPoolExecutor(max_workers=10)

files = get_all_data_files_path('data')
init_db("DB_NAME.db", "TABLE_NAME")

# One shared model instance and one shared DB connection for all worker threads
emotion_detector = EmotionDetection()
conn = sqlite3.connect("DB_NAME.db")
cursor = conn.cursor()

pbar = tqdm(total=len(files))
futures = [ex.submit(run, file, "TABLE_NAME") for file in files]
for future in futures:
    res = future.result()
    pbar.update(1)
pbar.close()