I'm currently trying to learn Python and, at the same time, machine learning with GPT-2 language modelling. I've had some problems, got past most of them, and finally have something decent running.
But, as most of you probably know, training your model takes a lot of CPU/GPU power and time. Time I can spare, but the problem is that I can't have it running non-stop on my home computer (yes, I know I can rent a GPU at Google), since I won't be able to do anything else on it while the model is training.
So I have the following questions:
- Can I somehow stop and restart my model's training? I read something about checkpoints, but there is so much outdated info on this topic that I haven't been able to figure it out. (I've pasted a rough sketch of what I've pieced together right after this list.)
- Can I incrementally feed my model e.g. 10% of my dataset, let it finish, and then feed it another 10% next week, and so on? If so, how? (There's a second sketch of what I imagine at the very bottom, after my training code.)
- Bonus question: is it better to aim for many epochs on a smaller dataset, or a larger dataset with fewer epochs? And what is a good number of epochs?
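For the checkpoint question, this is roughly what I've pieced together from the Keras docs. The folder name ./training_checkpoints is just my own placeholder, and I'm not sure this is the right way to do it with a TFGPT2LMHeadModel, so please correct me if it's off:

import os
import tensorflow as tf

checkpoint_dir = './training_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# save the weights at the end of every epoch (as far as I understand, subclassed
# models like TFGPT2LMHeadModel should use save_weights_only=True)
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}'),
    save_weights_only=True)

# during training I would pass the callback to fit():
# history = model.fit(dataset, epochs=num_epoch, callbacks=[ckpt_callback])

# and to resume later, rebuild the model from the same config and load the
# latest checkpoint before calling fit() again:
# model = TFGPT2LMHeadModel(config)
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))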
Packages:
- Python 3.7.9
- Tensorflow-gpu 2.3.0
- Tensorflow-estimator 2.3.0
- Transformers 4.2.2
- Tokenizers 0.9.4
- cudatoolkit 10.1
Code - Tokenizer
import os

from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer


class BPE_token(object):
    def __init__(self):
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        # train a byte-level BPE vocabulary on the text files in `paths`
        trainer = BpeTrainer(vocab_size=50000, show_progress=True,
                             initial_alphabet=ByteLevel.alphabet(),
                             special_tokens=[
                                 "<s>",
                                 "<pad>",
                                 "</s>",
                                 "<unk>",
                                 "<mask>"
                             ])
        self.tokenizer.train(trainer, paths)

    def save_tokenizer(self, location, prefix=None):
        # write the trained vocab/merges files to `location`
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)
# ////////// TOKENIZE DATA ////////////
from pathlib import Path

# the folder 'da_corpus' contains all the training .txt files
paths = [str(x) for x in Path("./da_corpus/").glob("**/*.txt")]

# train the tokenizer model
tokenizer = BPE_token()
tokenizer.bpe_train(paths)

# save the trained tokenizer in our specified folder
save_path = 'tokenized_data'
tokenizer.save_tokenizer(save_path)
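Just as a quick sanity check (not part of the actual training script), I assume the saved vocab.json / merges.txt can be loaded straight back into a GPT2Tokenizer like this (the Danish test sentence is only an example):

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained('tokenized_data')
ids = tok.encode("bare en lille test")
print(ids)              # token ids
print(tok.decode(ids))  # should give the original sentence back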
Code - Model Trainer
import os
from pathlib import Path

import tensorflow as tf
from transformers import GPT2Config, GPT2Tokenizer, TFGPT2LMHeadModel, WEIGHTS_NAME, CONFIG_NAME

save_path = 'tokenized_data'
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
paths = [str(x) for x in Path("./da_corpus/").glob("**/*.txt")]
# tokenizer = Tokenizer.from_file("./tokenized_data/tokenizer-wiki.json")
tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})

# creating the configuration from which the model can be made
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# creating the model
model = TFGPT2LMHeadModel(config)
# concatenate the whole corpus into one string, with </s> between documents
single_string = ''
for filename in paths:
    with open(filename, "r", encoding='utf-8') as f:
        x = f.read()
        single_string += x + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)
# print(string_tokenized)

# cut the token stream into fixed-size blocks, shifted by one token for the labels
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 2000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
    examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
    inputs.append(ex[:-1])
    labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# compiling the model
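# (my understanding is that only the logits output gets a loss here; the extra
# None entries line up with the per-layer past key/value outputs that
# TFGPT2LMHeadModel also returns, but please correct me if that's wrong)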
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
num_epoch = 20
history = model.fit(dataset, epochs=num_epoch)
output_dir = './model_bn_custom/'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# save model and model configs
model.save_pretrained(output_dir)
model_to_save.config.to_json_file(output_config_file)
# save tokenizer
tokenizer.save_pretrained(output_dir)
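And for the incremental-feeding question, this is roughly how I imagine "week 2" would go: reload what I saved above and train on the next slice of the corpus. The folder ./da_corpus_part2/ is just a placeholder for wherever I would keep the next batch of .txt files, and I don't know whether re-compiling like this throws away the optimizer state, so any pointers are welcome:

import tensorflow as tf
from pathlib import Path
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

output_dir = './model_bn_custom/'

# reload the tokenizer and the weights saved at the end of the previous run
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = TFGPT2LMHeadModel.from_pretrained(output_dir)

# build a dataset from the next slice of text, same recipe as above
new_paths = [str(x) for x in Path("./da_corpus_part2/").glob("**/*.txt")]
single_string = ''
for filename in new_paths:
    with open(filename, "r", encoding='utf-8') as f:
        single_string += f.read() + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

block_size = 100
examples = [string_tokenized[i:i + block_size]
            for i in range(0, len(string_tokenized) - block_size + 1, block_size)]
inputs = [ex[:-1] for ex in examples]
labels = [ex[1:] for ex in examples]
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(2000).batch(12, drop_remainder=True)

# compile the same way as before and keep training from the loaded weights
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
model.fit(dataset, epochs=5)

# overwrite the saved model so the next run picks up from here
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)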