I'm learning how to train generative models with the HuggingFace transformers library, but I keep getting this error when I tokenize the text of my dataset:

`TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]`
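From what I can tell, a fast (Rust-backed) tokenizer raises exactly this `TypeError` whenever `encode()` receives something that is not a string, for example a float `NaN` coming out of pandas. A minimal sketch of the failure, using `gpt2` only as a stand-in for whatever checkpoint `args.tokenizer_name` points at:

```python
from transformers import AutoTokenizer

# gpt2 is a stand-in checkpoint; AutoTokenizer loads its fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

tokenizer.encode("hola")        # fine: returns a list of token ids
tokenizer.encode(float("nan"))  # TypeError: TextEncodeInput must be Union[...]
```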
Here's my code:
```python
# Imports needed by the snippets below; Args, train() and evaluate()
# are defined elsewhere in my notebook (not shown here).
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import List

import numpy as np
import torch
from torch.utils.data import Dataset

from transformers import (
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedTokenizer,
    WEIGHTS_NAME,
)

logger = logging.getLogger(__name__)


def construct_conv(row, tokenizer, eos=True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    # tokenizer.encode(x) is the tokenization step where the TypeError appears
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    conv = flatten(conv)
    return conv


class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)
            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)


# Caching and storing of data/checkpoints
def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)


def main(df_trn, df_val):
    args = Args()  # Args holds the training configuration (defined elsewhere in my notebook)

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)
    print(tokenizer)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
```
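For reference, `construct_conv` assumes every cell of a DataFrame row is a string: it encodes each utterance, appends the EOS id, reverses the list so the oldest context comes first, and flattens everything into one sequence. A small sketch of what it builds for one row, using the stand-in tokenizer from above and cells taken from my first data row:

```python
# Hypothetical row laid out like my dataset: [reponse, context, context/0].
row = ["Es tan simple.", "No se que te detiene.", "¡Muy persuasiva!"]

conv = construct_conv(row, tokenizer)
# conv == ids("¡Muy persuasiva!")        + [eos]
#       + ids("No se que te detiene.")   + [eos]
#       + ids("Es tan simple.")          + [eos]
```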
And this is my dataset:
ID | reponse | context | context/0 | context/1 | context/2 | context/3 | context/4 | context/5 | context/6 | context/7 | context/8 | context/9 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
36917 | Es tan simple. | No se que te detiene. | ¡Muy persuasiva! | No es crimen librarse de una alimaña. | ¿Por qué tener lastima de un hombre tan vil? | Además, también ha puesto sus ojos en Okayo. | Hace 4 años que soy victima de mi marido. | Solo estoy siendo franca contigo. | Calmate. | ¡Eres peor que el diablo! | ¿Comprendes? | Okayo me recuerda constantemente mi fracaso. |
5449 | Muy torpe, Joyce. | A la sala de interrogación rápido. ¡Muévanse! | De pie, muchachos. | A la sala de interrogación rápido. ¡Muévanse! | De pie, muchachos. | ¡Use su cuchillo, hombre! | ¡Adelántese, Thomson! | ¡Bien hecho, Jenkins! | Gracias. | Muy bien. | El bungaló del mayor Warden está al final del ... | Continúe, conductor. |
37004 | Pídemelo. | Sólo lo que quieras tú. | Ya no soy yo. | Eres preciosa y maravillosa. | ¿No? | Así te gustaré. | Haré y diré lo que quieras. | Nunca. | Así nunca querrás estar con otras, ¿verdad? | Siempre diré lo que tú desees y haré lo que tú... | Pero yo sí. | Pero... |
47077 | ¡Boris! | ¡Nicolás, que alegría a mi corazón, volviste! | ¡Regresan los Vencedores! | ¡Miren! | ¡Ahí vienen! | Está vivo. | Boris está vivo. | Dasha prometió avisarme cuando regrese. | Pero, en la fábrica dicen que él está en una u... | Tampoco hay noticias de Stepan. | ¡Quién sabe! | ¿Por qué entonces, no hay noticias de él? |
41450 | Entonces por qué no estamos en mejor situación... | Dora Hartley era una buena prueba. | Mire, lo que hace usted creer ¿Qué los indios ... | Aleja esa arma. | Buenas noches. | Es hora de ir a la cama. | Seguro. | Sí. recuerde que es un secreto. | Es bonita. | Está bien. | ¿Ann Martin? | Hola, Bax. |
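Since every row has to fill ten `context/*` columns, I suspect that shorter conversations leave `NaN` cells behind, and a float `NaN` would then reach `tokenizer.encode()` inside `construct_conv`. This is how I am checking, and the workaround I am considering (`df_trn` is my training frame):

```python
# Does any cell in the frame hold NaN? A float NaN reaching
# tokenizer.encode() would explain the TypeError above.
print(df_trn.isna().any().any())

# Possible workaround: drop incomplete rows, or replace NaN with ""
# and force every cell to str before tokenizing.
df_trn = df_trn.dropna()
# df_trn = df_trn.fillna("").astype(str)
```

Is that the likely cause of the error, or am I missing something else?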