I’m trying to follow this tutorial to fine-tune BERT for an NER task on my own dataset: https://www.philschmid.de/huggingface-transformers-keras-tf. Below is my shortened code and the error raised by its last line. I’m new to all of this, and thank you in advance for helping out!
import ast
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification

# load dataset
df_converters = {'tokens': ast.literal_eval, 'labels': ast.literal_eval}
train_df = pd.read_csv("train_df_pretokenization.csv", converters=df_converters)
train_df = train_df.head(10)
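For context, each row of train_df_pretokenization.csv stores the already-split words and their tags as stringified Python lists (hence the ast.literal_eval converters), plus an index column. A made-up row, just to show the shape of the data rather than my real values:

index,tokens,labels
0,"['jalan', 'sudirman', 'no', '5']","['B-STR', 'E-STR', 'O', 'O']"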
# model and pretrained tokenizer
model_ckpt = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# tokenization, and align labels
def tokenize_and_align_labels(batch):
    tag2int = {'B-POI': 0, 'B-STR': 1, 'E-POI': 2, 'E-STR': 3, 'I-POI': 4,
               'I-STR': 5, 'S-POI': 6, 'S-STR': 7, 'O': 8}
    # tokenized_inputs = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True, padding=True)
    tokenized_inputs = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True)
    labels = []
    for idx, label in enumerate(batch['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(tag2int[label[word_idx]])
            else:
                label_ids.append(tag2int[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['tags'] = labels
    return tokenized_inputs
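To illustrate what the loop above is working with, here is a throwaway check on a made-up pre-tokenized example (not my real data; the exact subword split depends on the vocab):

example = tokenizer(['jalan', 'sudirman'], is_split_into_words=True)
print(tokenizer.convert_ids_to_tokens(example['input_ids']))
# e.g. ['[CLS]', 'jalan', 'sudirman', '[SEP]']
print(example.word_ids())
# e.g. [None, 0, 1, None] -> the loop would produce [-100, tag2int['B-STR'], tag2int['E-STR'], -100]

So every subword keeps the tag of the word it came from, and the special tokens get -100 so they are ignored by the loss.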
def encode_dataset(ds):
    return ds.map(tokenize_and_align_labels, batched=True, batch_size=10,
                  remove_columns=['labels', 'tokens', 'index'])
train_ds = Dataset.from_pandas(train_df)
train_ds_encoded = encode_dataset(train_ds)
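At this point the encoded examples still have different lengths, since the tokenizer was called with truncation but no padding. A quick check (the printed values here are just illustrative):

print([len(x) for x in train_ds_encoded['input_ids']])
# e.g. [12, 7, 15, 9, ...] -- one entry per example, lengths differ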
# prepare model input
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
tf_train_dataset = train_ds_encoded.to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'tags'],
    shuffle=False,
    batch_size=5,
    collate_fn=data_collator
)
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.
I thought the data collator was supposed to take care of padding each batch of the requested size, so I don’t understand why feeding in sequences of different lengths causes this error. Indeed, the tutorial runs fine without specifying padding or truncation. My code does run if I pass padding=True to the tokenizer inside the function (the line I commented out above), but I don’t think that is the right place to add padding.
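For what it’s worth, this is the behaviour I expected from the collator: given features of different lengths, pad each batch to its longest sequence at collation time. A minimal standalone sketch of that expectation, with made-up token ids and the label key simply called 'labels' here for illustration:

features = [
    {'input_ids': [2, 10, 11, 3], 'labels': [-100, 0, 2, -100]},
    {'input_ids': [2, 10, 11, 12, 13, 3], 'labels': [-100, 0, 4, 4, 2, -100]},
]
batch = data_collator(features)
print(batch['input_ids'].shape)  # (2, 6) -- the shorter example is padded to the longest in the batch
print(batch['labels'])           # label rows padded on the right with -100

What is the right way to get this per-batch padding in my setup?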