I'm currently trying to learn Python and, at the same time, machine learning with GPT-2 language modelling. I've had some problems, got past most of them, and finally have something decent running.
But, as most of you probably know, training your model takes a lot of CPU/GPU power and time. Time I can spare, but the problem is that I can't have it running non-stop on my home computer (yes, I know I can rent a GPU at Google), since I won't be able to do anything else on it while the model is training.
So I have the following questions:
- Can I somehow stop and restart my model's training? I read something about checkpoints, but there is so much outdated info on this topic that I haven't been able to figure it out. (I've pasted a rough sketch of what I've pieced together right after this list.)
- Can I incrementally feed my model e.g. 10% of my dataset, let it finish, and then feed it another 10% next week, and so on? If so, how? (There's a second sketch of what I imagine at the very bottom, after my training code.)
- Bonus question: is it better to aim for many epochs on a smaller dataset, or a larger dataset with fewer epochs? And what is a good number of epochs?
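For the checkpoint question, this is roughly what I've pieced together from the Keras docs. The folder name ./training_checkpoints is just my own placeholder, and I'm not sure this is the right way to do it with a TFGPT2LMHeadModel, so please correct me if it's off:

import os
import tensorflow as tf

checkpoint_dir = './training_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# save the weights at the end of every epoch (as far as I understand, subclassed
# models like TFGPT2LMHeadModel should use save_weights_only=True)
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}'),
    save_weights_only=True)

# during training I would pass the callback to fit():
# history = model.fit(dataset, epochs=num_epoch, callbacks=[ckpt_callback])

# and to resume later, rebuild the model from the same config and load the
# latest checkpoint before calling fit() again:
# model = TFGPT2LMHeadModel(config)
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))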
Packages:
- Python 3.7.9
- Tensorflow-gpu 2.3.0
- Tensorflow-estimator 2.3.0
- Transformers 4.2.2
- Tokenizers 0.9.4
- cudatoolkit 10.1
Code - Tokenizer
import os

from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer


class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        # train a byte-level BPE vocabulary on the text files in `paths`
        trainer = BpeTrainer(vocab_size=50000, show_progress=True,
                             initial_alphabet=ByteLevel.alphabet(),
                             special_tokens=[
                                 "<s>",
                                 "<pad>",
                                 "</s>",
                                 "<unk>",
                                 "<mask>"
                             ])
        self.tokenizer.train(trainer, paths)

    def save_tokenizer(self, location, prefix=None):
        # write the trained vocab/merges files to `location`
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)
# ////////// TOKENIZE DATA ////////////
from pathlib import Path

# the folder 'da_corpus' contains all the training .txt files
paths = [str(x) for x in Path("./da_corpus/").glob("**/*.txt")]

# train the tokenizer model
tokenizer = BPE_token()
tokenizer.bpe_train(paths)

# save the trained tokenizer in our specified folder
save_path = 'tokenized_data'
tokenizer.save_tokenizer(save_path)
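Just as a quick sanity check (not part of the actual training script), I assume the saved vocab.json / merges.txt can be loaded straight back into a GPT2Tokenizer like this (the Danish test sentence is only an example):

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained('tokenized_data')
ids = tok.encode("bare en lille test")
print(ids)              # token ids
print(tok.decode(ids))  # should give the original sentence back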
Code - Model Trainer
import os
from pathlib import Path

import tensorflow as tf
from transformers import GPT2Config, GPT2Tokenizer, TFGPT2LMHeadModel, WEIGHTS_NAME, CONFIG_NAME

save_path = 'tokenized_data'
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
paths = [str(x) for x in Path("./da_corpus/").glob("**/*.txt")]
# tokenizer = Tokenizer.from_file("./tokenized_data/tokenizer-wiki.json")
tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})

# creating the configuration from which the model can be made
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# creating the model
model = TFGPT2LMHeadModel(config)
# concatenate the whole corpus into one string, with </s> between documents
single_string = ''
for filename in paths:
    with open(filename, "r", encoding='utf-8') as f:
        x = f.read()
        single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)
# print(string_tokenized)

# cut the token stream into fixed-size blocks, shifted by one token for the labels
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 2000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
    examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
    inputs.append(ex[:-1])
    labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# compiling the model
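# (my understanding is that only the logits output gets a loss here; the extra
# None entries line up with the per-layer past key/value outputs that
# TFGPT2LMHeadModel also returns, but please correct me if that's wrong)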
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
num_epoch = 20
history = model.fit(dataset, epochs=num_epoch)
output_dir = './model_bn_custom/'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# save model and model configs
model.save_pretrained(output_dir)
model_to_save.config.to_json_file(output_config_file)
# save tokenizer
tokenizer.save_pretrained(output_dir)
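And for the incremental-feeding question, this is roughly how I imagine "week 2" would go: reload what I saved above and train on the next slice of the corpus. The folder ./da_corpus_part2/ is just a placeholder for wherever I would keep the next batch of .txt files, and I don't know whether re-compiling like this throws away the optimizer state, so any pointers are welcome:

import tensorflow as tf
from pathlib import Path
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

output_dir = './model_bn_custom/'

# reload the tokenizer and the weights saved at the end of the previous run
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = TFGPT2LMHeadModel.from_pretrained(output_dir)

# build a dataset from the next slice of text, same recipe as above
new_paths = [str(x) for x in Path("./da_corpus_part2/").glob("**/*.txt")]
single_string = ''
for filename in new_paths:
    with open(filename, "r", encoding='utf-8') as f:
        single_string += f.read() + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

block_size = 100
examples = [string_tokenized[i:i + block_size]
            for i in range(0, len(string_tokenized) - block_size + 1, block_size)]
inputs = [ex[:-1] for ex in examples]
labels = [ex[1:] for ex in examples]
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(2000).batch(12, drop_remainder=True)

# compile the same way as before and keep training from the loaded weights
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
model.fit(dataset, epochs=5)

# overwrite the saved model so the next run picks up from here
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)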