I'm trying to fine-tune a T5 model to paraphrase Farsi sentences, using erfan226/persian-t5-paraphraser as my base model. My dataset is a paired-sentence dataset in which each row is a pair of paraphrased sentences, and I want to fine-tune the model on it. After each epoch I want to save the tokenizer's vocabulary and the pretrained model so I can use them later, but when I do I get this error:
ValueError: Your fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.
When I tried my code with the t5-base model, it worked fine, but with this model it doesn't.
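If it helps narrow things down, I believe the same error can be reproduced outside the training loop with just the tokenizer (a quick sketch; the output directory name is arbitrary):

import os
from transformers import AutoTokenizer

os.makedirs('./vocab_test', exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained('erfan226/persian-t5-paraphraser')
# I expect this call to raise the same ValueError, while the t5-base tokenizer saves fine:
tokenizer.save_vocabulary('./vocab_test')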
I have searched Google for this problem but haven't found any related answers. Here is my code:
!pip install pytorch_lightning==1.7.7
!pip install transformers
!pip install sentencepiece
import os
import pytorch_lightning as pl
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset, random_split
from transformers.optimization import AdamW
from pytorch_lightning.callbacks import Callback
save_path = './Models/paraphrase'
!mkdir -p $save_path
class ParaphraseGenerator(pl.LightningModule):
    def __init__(self):
        super().__init__()
        model_name = 'erfan226/persian-t5-paraphraser'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.batch_size = 16
        self.lr = 4e-5

    def encode_text(self, data_path):
        with open(data_path, 'r', encoding='utf-8') as r:
            data = json.load(r)
        for item in tqdm(data):
            # tokenizing original and paraphrase:
            source = self.tokenizer(
                item['sentence_1'], max_length=80, truncation=True, padding='max_length', return_tensors='pt')
            target = self.tokenizer(
                item['sentence_2'], max_length=200, truncation=True, padding='max_length', return_tensors='pt')
            yield source['input_ids'], target['input_ids']

    def to_tensor(self, source_ids, target_ids):
        source_ids = torch.cat(source_ids, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        data = TensorDataset(source_ids, target_ids)
        return random_split(data, [len(data), 0])[0]

    def prepare_data(self):
        train_path = "./train_dataset.json"
        test_path = "./test_dataset.json"
        source_ids, target_ids = list(
            zip(*tuple(self.encode_text(train_path))))
        self.train_ds = self.to_tensor(source_ids, target_ids)
        source_ids, target_ids = list(
            zip(*tuple(self.encode_text(test_path))))
        self.test_ds = self.to_tensor(source_ids, target_ids)

    def forward(self, batch, batch_idx):
        source_ids, target_ids = batch[:2]
        return self.model(input_ids=source_ids, labels=target_ids)

    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('val_loss', loss)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_ds, batch_size=self.batch_size, drop_last=True, shuffle=True, num_workers=0)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.test_ds, batch_size=self.batch_size, drop_last=False, shuffle=False, num_workers=0)

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr, weight_decay=0.01)
class SaveCallback(Callback):
    def on_epoch_start(self, trainer, pl_module):
        if pl_module.current_epoch > 0:
            current_epoch = str(pl_module.current_epoch)
            fn = f'epoch_{current_epoch}'
            new_path = f"{save_path}/{fn}/"
            if fn not in os.listdir(save_path):
                os.mkdir(new_path)
            # the next line is where the ValueError is raised:
            pl_module.tokenizer.save_vocabulary(new_path)
            pl_module.model.save_pretrained(new_path)
trainer = pl.Trainer(
    default_root_dir='logs',
    min_epochs=4,
    gpus=-1,
    max_epochs=5,
    val_check_interval=0.5,
    callbacks=[SaveCallback()],
    logger=pl.loggers.TensorBoardLogger('logs/', name='paraphrase', version=0)
)
para_model = ParaphraseGenerator()
trainer.fit(para_model)
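For context, train_dataset.json and test_dataset.json are lists of sentence pairs with sentence_1/sentence_2 keys (this is how encode_text reads them), roughly in this shape, with placeholder values since the real sentences are in Farsi:

[
  {"sentence_1": "original sentence 1", "sentence_2": "paraphrase of sentence 1"},
  {"sentence_1": "original sentence 2", "sentence_2": "paraphrase of sentence 2"}
]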
The error occurs in SaveCallback, when save_vocabulary is called. Also, I'm running this code on Google Colab.
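One thing I'm considering (but haven't verified) is replacing save_vocabulary with save_pretrained on the tokenizer, since that should not need the slow-tokenizer files, roughly like this inside the callback:

# hypothetical replacement inside SaveCallback.on_epoch_start:
pl_module.tokenizer.save_pretrained(new_path)  # instead of save_vocabulary(new_path)
pl_module.model.save_pretrained(new_path)

Would that be enough to reload the tokenizer later, or am I losing something by not saving the vocabulary explicitly?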