Note: I am using the free TPU provided on Kaggle.
I want to tokenize the text with transformers so that only the current batch is tokenized during training, instead of tokenizing the whole dataset up front and then creating batches from the tokenized dataset, since that goes OOM and is also inefficient. Below is a basic overview of what I want:
import tensorflow as tf
import transformers

tokenizer = transformers.RobertaTokenizerFast.from_pretrained('roberta-base')

def tokenize(text, labels):
    tokenized = tokenizer(text, padding=True, truncation=True, max_length=MAX_LEN)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    return (ids, mask), labels

train_dataset = tf.data.Dataset.from_tensor_slices((text, train_label_chunk)).batch(BATCH_SIZE)
train_dataset = train_dataset.map(tokenize)
Below is the error it gives. I won't share the whole trace as the error is pretty clear
ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)
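In other words, the tokenizer only accepts plain Python strings or lists of strings, not tf.string tensors; as far as I can tell, this is what trips it up:

tokenizer("a single example")                # ok: str
tokenizer(["example one", "example two"])    # ok: List[str]
tokenizer(tf.constant(["example one"]))      # raises the ValueError above: a tf.Tensor is none of the accepted types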
This should, in principle, be solvable by decoding the tensors, something like:
for i in train_dataset:
    sample = i[0]
    break

sample.numpy()[0].decode()
which gives a proper string, but manually decoding every single tf.string is not practical. It also errors out anyway when I try this inside the map function:
def tokenize(text, labels):
    text = text.numpy()
    tokenized = tokenizer(text, padding=True, truncation=True, max_length=MAX_LEN)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    return (ids, mask), labels
Error:
AttributeError: in user code:
<ipython-input-37-857b904b7110>:2 tokenize *
text = text.numpy()
AttributeError: 'Tensor' object has no attribute 'numpy'
I am not sure why that is, but in any case this can't be done inside Dataset.map, since the function is traced in graph mode where tensors have no .numpy(). There is also a GitHub issue on the same topic here.
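For completeness, this is roughly the kind of tf.py_function wrapper I assume would be needed to get eager tensors (and hence .numpy()) inside the pipeline; just a sketch, and I have not verified that it plays nicely with the TPU input pipeline:

def tokenize_py(text, labels):
    # inside tf.py_function the tensors are eager, so .numpy() works here
    texts = [t.decode('utf-8') for t in text.numpy()]
    # pad to max_length so the output shape is static
    tokenized = tokenizer(texts, padding='max_length', truncation=True, max_length=MAX_LEN)
    return tf.constant(tokenized['input_ids']), tf.constant(tokenized['attention_mask']), labels

def tokenize_map(text, labels):
    ids, mask, labels = tf.py_function(
        tokenize_py, inp=[text, labels], Tout=[tf.int32, tf.int32, labels.dtype])
    # py_function drops the static shape, so set it back manually
    ids.set_shape([None, MAX_LEN])
    mask.set_shape([None, MAX_LEN])
    return (ids, mask), labels

train_dataset = train_dataset.map(tokenize_map)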
Below are some other things that I tried. First, I created a new dataset class:
class TrainDataset():
    def __init__(self, text, label, batch_size):
        self.text = text
        self.label = label
        self.batch_size = batch_size

    def __len__(self):
        return len(self.text) // self.batch_size

    def __getitem__(self, idx):
        text = self.text[idx*self.batch_size:(idx+1)*self.batch_size]
        label = self.label[idx*self.batch_size:(idx+1)*self.batch_size]
        return text, label

ds = TrainDataset(text, train_label_chunk, BATCH_SIZE)
def train_loop(train_dataset):
    with strategy.scope():
        for step, (x, y) in enumerate(train_dataset):
            train_data = tokenizer(x, padding=True, truncation=True, max_length=MAX_LEN, return_tensors='tf')
            inputs = (train_data['input_ids'], train_data['attention_mask'])
            with tf.GradientTape() as tape:
                preds = model(inputs, training=True)
                loss_value = loss_fun(y, preds)
            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            break

train_loop(ds)
which yields the following error
ValueError: Please use `tf.keras.losses.Reduction.SUM` or `tf.keras.losses.Reduction.NONE` for loss reduction when losses are used with `tf.distribute.Strategy` outside of the built-in training loops. You can implement `tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` using global batch size like:
with strategy.scope(): loss_obj = tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) .... loss = tf.reduce_sum(loss_obj(labels, predictions)) * (1. / global_batch_size)
Please see https://www.tensorflow.org/tutorials/distribute/custom_training for more details.
After that, I changed loss_fun to loss_object as below (I also changed the activation of the last layer to get the logits):
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
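For context, the compute_loss and train_accuracy used in the next snippet follow the pattern from the TF custom-training tutorial referenced by the error; roughly a sketch like this, where GLOBAL_BATCH_SIZE is BATCH_SIZE times the number of replicas:

def compute_loss(labels, predictions):
    # per-example losses (reduction NONE above), averaged over the global batch
    per_example_loss = loss_object(labels, predictions)
    return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')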
which gives the error below:
RuntimeError: `apply_gradients() cannot be called in cross-replica context. Use `tf.distribute.Strategy.run` to enter replica context.
At this point I wrote a fully custom training loop:
def train_step(inputs):
    x, y = inputs
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = compute_loss(y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_accuracy.update_state(y, predictions)
    return loss

@tf.function
def distributed_train_step(dataset_inputs):
    x, y = dataset_inputs
    train_data = tokenizer(x, padding=True, truncation=True, max_length=MAX_LEN, return_tensors='tf')
    inputs = (train_data['input_ids'], train_data['attention_mask'])
    dataset_inputs = (inputs, y)
    per_replica_losses = strategy.run(train_step, args=(dataset_inputs,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

for epoch in range(2):
    # TRAIN LOOP
    total_loss = 0.0
    num_batches = 0
    for x in tqdm(ds):
        total_loss += distributed_train_step(x)
        num_batches += 1
    train_loss = total_loss / num_batches

    template = ("Epoch {}, Loss: {}, Accuracy: {}")
    print(template.format(epoch+1, train_loss, train_accuracy.result()*100))
    train_accuracy.reset_states()
which did at least run, but gave the error below:
StagingError Traceback (most recent call last)
<ipython-input-24-2cda132cf9fa> in <module>
4 num_batches = 0
5 for x in tqdm(ds):
----> 6 total_loss += distributed_train_step(x)
7 num_batches += 1
8 train_loss = total_loss / num_batches
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
826 tracing_count = self.experimental_get_tracing_count()
827 with trace.Trace(self._name) as tm:
--> 828 result = self._call(*args, **kwds)
829 compiler = "xla" if self._experimental_compile else "nonXla"
830 new_tracing_count = self.experimental_get_tracing_count()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
860 # In this case we have not created variables on the first call. So we can
861 # run the first trace but we should fail if variables are created.
--> 862 results = self._stateful_fn(*args, **kwds)
863 if self._created_variables:
864 raise ValueError("Creating variables on a non-first call to a function"
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
2939 with self._lock:
2940 (graph_function,
-> 2941 filtered_flat_args) = self._maybe_define_function(args, kwargs)
2942 return graph_function._call_flat(
2943 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
3359
3360 self._function_cache.missed.add(call_context_key)
-> 3361 graph_function = self._create_graph_function(args, kwargs)
3362 self._function_cache.primary[cache_key] = graph_function
3363
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
3204 arg_names=arg_names,
3205 override_flat_arg_shapes=override_flat_arg_shapes,
-> 3206 capture_by_value=self._capture_by_value),
3207 self._function_attributes,
3208 function_spec=self.function_spec,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
988 _, original_func = tf_decorator.unwrap(python_func)
989
--> 990 func_outputs = python_func(*func_args, **func_kwargs)
991
992 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
632 xla_context.Exit()
633 else:
--> 634 out = weak_wrapped_fn().__wrapped__(*args, **kwds)
635 return out
636
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
975 except Exception as e: # pylint:disable=broad-except
976 if hasattr(e, "ag_error_metadata"):
--> 977 raise e.ag_error_metadata.to_exception(e)
978 else:
979 raise
StagingError: in user code:
<ipython-input-19-9d8bdb5f7f7c>:4 distributed_train_step *
train_data = tokenizer(x, padding=True, truncation=True, max_length=MAX_LEN, return_tensors='tf')
/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2305 __call__ *
**kwargs,
/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2490 batch_encode_plus *
**kwargs,
/opt/conda/lib/python3.7/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py:163 _batch_encode_plus *
return super()._batch_encode_plus(*args, **kwargs)
/opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_fast.py:418 _batch_encode_plus *
for key in tokens_and_encodings[0][0].keys():
IndexError: list index out of range
The IndexError: list index out of range might be solvable, but even apart from that the training speed was really slow, so I think something is fundamentally wrong with this approach.
At this point, any help would be highly appreciated.