I'm trying to fine-tune a T5 model to paraphrase Farsi sentences, using erfan226/persian-t5-paraphraser as my base model. My dataset is a paired-sentence dataset in which each row is a pair of paraphrased sentences, and I want to fine-tune the model on it. After each epoch I want to save the tokenizer's vocabulary and the pretrained model so I can use them later, but when I do I get this error:
ValueError: Your fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.
When I tried my code with the t5-base model, it worked fine, but with this model it doesn't.
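If it helps narrow things down, I believe the same error can be reproduced outside the training loop with just the tokenizer (a quick sketch; the output directory name is arbitrary):

import os
from transformers import AutoTokenizer

os.makedirs('./vocab_test', exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained('erfan226/persian-t5-paraphraser')
# I expect this call to raise the same ValueError, while the t5-base tokenizer saves fine:
tokenizer.save_vocabulary('./vocab_test')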
I have searched Google for this problem but haven't found any related answers. Here is my code:
!pip install pytorch_lightning==1.7.7
!pip install transformers
!pip install sentencepiece
import os
import pytorch_lightning as pl
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset, random_split
from transformers.optimization import AdamW
from pytorch_lightning.callbacks import Callback
save_path = './Models/paraphrase'
!mkdir -p $save_path
class ParaphraseGenerator(pl.LightningModule):
    def __init__(self):
        super().__init__()
        model_name = 'erfan226/persian-t5-paraphraser'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.batch_size = 16
        self.lr = 4e-5

    def encode_text(self, data_path):
        with open(data_path, 'r', encoding='utf-8') as r:
            data = json.load(r)
        for item in tqdm(data):
            # tokenizing original and paraphrase:
            source = self.tokenizer(
                item['sentence_1'], max_length=80, truncation=True, padding='max_length', return_tensors='pt')
            target = self.tokenizer(
                item['sentence_2'], max_length=200, truncation=True, padding='max_length', return_tensors='pt')
            yield source['input_ids'], target['input_ids']

    def to_tensor(self, source_ids, target_ids):
        source_ids = torch.cat(source_ids, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        data = TensorDataset(source_ids, target_ids)
        return random_split(data, [len(data), 0])[0]

    def prepare_data(self):
        train_path = "./train_dataset.json"
        test_path = "./test_dataset.json"
        source_ids, target_ids = list(
            zip(*tuple(self.encode_text(train_path))))
        self.train_ds = self.to_tensor(source_ids, target_ids)
        source_ids, target_ids = list(
            zip(*tuple(self.encode_text(test_path))))
        self.test_ds = self.to_tensor(source_ids, target_ids)

    def forward(self, batch, batch_idx):
        source_ids, target_ids = batch[:2]
        return self.model(input_ids=source_ids, labels=target_ids)

    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('val_loss', loss)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_ds, batch_size=self.batch_size, drop_last=True, shuffle=True, num_workers=0)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.test_ds, batch_size=self.batch_size, drop_last=False, shuffle=False, num_workers=0)

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr, weight_decay=0.01)
class SaveCallback(Callback):
    def on_epoch_start(self, trainer, pl_module):
        if pl_module.current_epoch > 0:
            current_epoch = str(pl_module.current_epoch)
            fn = f'epoch_{current_epoch}'
            new_path = f"{save_path}/{fn}/"
            if fn not in os.listdir(save_path):
                os.mkdir(new_path)
            # the next line is where the ValueError is raised:
            pl_module.tokenizer.save_vocabulary(new_path)
            pl_module.model.save_pretrained(new_path)
trainer = pl.Trainer(
    default_root_dir='logs',
    min_epochs=4,
    gpus=-1,
    max_epochs=5,
    val_check_interval=0.5,
    callbacks=[SaveCallback()],
    logger=pl.loggers.TensorBoardLogger('logs/', name='paraphrase', version=0)
)
para_model = ParaphraseGenerator()
trainer.fit(para_model)
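For context, train_dataset.json and test_dataset.json are lists of sentence pairs with sentence_1/sentence_2 keys (this is how encode_text reads them), roughly in this shape, with placeholder values since the real sentences are in Farsi:

[
  {"sentence_1": "original sentence 1", "sentence_2": "paraphrase of sentence 1"},
  {"sentence_1": "original sentence 2", "sentence_2": "paraphrase of sentence 2"}
]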
The error occurs in SaveCallback, when save_vocabulary is called. Also, I'm running this code on Google Colab.
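One thing I'm considering (but haven't verified) is replacing save_vocabulary with save_pretrained on the tokenizer, since that should not need the slow-tokenizer files, roughly like this inside the callback:

# hypothetical replacement inside SaveCallback.on_epoch_start:
pl_module.tokenizer.save_pretrained(new_path)  # instead of save_vocabulary(new_path)
pl_module.model.save_pretrained(new_path)

Would that be enough to reload the tokenizer later, or am I losing something by not saving the vocabulary explicitly?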