Tokenize dataset using map on tf.data.Dataset.from_tensor_slices(....)

Question

Note: I am using the free TPU provided on Kaggle.

I want to tokenize the text using transformers so that only the current batch is tokenized while training the model, instead of tokenizing the whole dataset first and then creating batches from the tokenized data, because that approach runs out of memory (OOM) and is also inefficient. Below is a basic overview of what I want:

# Load the fast RoBERTa tokenizer for the 'roberta-base' checkpoint.
tokenizer = transformers.RobertaTokenizerFast.from_pretrained('roberta-base')

def tokenize(text, labels):
    """Tokenize a batch of text and pair the encodings with its labels.

    Args:
        text: Batch of text handed directly to ``tokenizer``.
        labels: Labels for the batch; passed through unchanged.

    Returns:
        ``((input_ids, attention_mask), labels)``.
    """
    encoding = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
    )
    features = (encoding['input_ids'], encoding['attention_mask'])
    return features, labels

# Slice the (text, label) pairs and group them into batches of BATCH_SIZE.
train_dataset = tf.data.Dataset.from_tensor_slices((text, train_label_chunk)).batch(BATCH_SIZE)
# Apply `tokenize` to each batch; `train_dataset` is rebound only if map() succeeds.
train_dataset = train_dataset.map(tokenize)

Below is the error it gives. I won't share the whole trace as the error is pretty clear

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)

which should be solvable by something like this

# Take the first component of the first dataset element only, then stop.
for i in train_dataset:
    sample = i[0]
    break

# Convert to NumPy and decode the first entry of that batch.
sample.numpy()[0].decode()

which gives a proper string but decoding every single tf.string is not possible. Also, it gives an error anyway when I try this

def tokenize(text, labels):  # variant of tokenize that converts `text` with .numpy() before tokenizing
    text = text.numpy()  # this is the line the AttributeError quoted below points at
    tokenized = tokenizer(text, padding=True, truncation=True, max_length=MAX_LEN)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    return (ids, mask), labels  # ((input_ids, attention_mask), labels)

error

AttributeError: in user code:
    <ipython-input-37-857b904b7110>:2 tokenize  *
        text = text.numpy()
    AttributeError: 'Tensor' object has no attribute 'numpy'

I am not sure why it is there, but in any case this can't be done. A GitHub thread on the same topic can also be seen here.

Below are some other things that I tried. First I created a new dataset class

class TrainDataset:
    """Serve ``(text, label)`` batches of a fixed size, addressed by batch index.

    Only full batches are exposed: ``len(ds)`` is ``len(text) // batch_size``
    and any trailing partial batch is not reachable.

    The object is iterable through the ``__getitem__`` protocol, which stops
    only when ``IndexError`` is raised. ``__getitem__`` therefore rejects
    out-of-range indices; returning an empty slice instead would make
    ``for batch in ds`` run forever and yield empty batches.
    """

    def __init__(self, text, label, batch_size):
        """Store the sliceable text/label sequences and the batch size."""
        self.text = text
        self.label = label
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of full batches."""
        return len(self.text) // self.batch_size

    def __getitem__(self, idx):
        """Return the ``(text, label)`` slices that make up batch ``idx``.

        Negative indices count from the last full batch.

        Raises:
            IndexError: If ``idx`` does not address a full batch.
        """
        num_batches = len(self)
        position = idx + num_batches if idx < 0 else idx
        if not 0 <= position < num_batches:
            raise IndexError(
                f"batch index {idx} out of range for {num_batches} batches"
            )
        start = position * self.batch_size
        stop = start + self.batch_size
        return self.text[start:stop], self.label[start:stop]

# TrainDataset requires text, labels and a batch size; pass the same
# objects the tf.data pipeline earlier in this post was built from.
ds = TrainDataset(text, train_label_chunk, BATCH_SIZE)

def train_loop(train_dataset):
    """Run a single optimisation step on the first batch of ``train_dataset``.

    Each element of ``train_dataset`` is unpacked as ``(text, labels)``. The
    text is tokenized on the fly, the model is run under a gradient tape, and
    the resulting gradients are applied. The loop exits after one batch.

    Args:
        train_dataset: Iterable yielding ``(text, labels)`` pairs.
    """
    with strategy.scope():
        for texts, labels in train_dataset:
            encoded = tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=MAX_LEN,
                return_tensors='tf',
            )
            model_inputs = (encoded['input_ids'], encoded['attention_mask'])

            with tf.GradientTape() as tape:
                loss_value = loss_fun(labels, model(model_inputs, training=True))

            weights = model.trainable_weights
            optimizer.apply_gradients(zip(tape.gradient(loss_value, weights), weights))
            break  # only the first batch is processed

# Run the single-step loop on the custom dataset object.
train_loop(ds)

which yields the following error

ValueError: Please use `tf.keras.losses.Reduction.SUM` or `tf.keras.losses.Reduction.NONE` for loss reduction when losses are used with `tf.distribute.Strategy` outside of the built-in training loops. You can implement `tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` using global batch size like:

with strategy.scope(): loss_obj = tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) .... loss = tf.reduce_sum(loss_obj(labels, predictions)) * (1. / global_batch_size)

Please see https://www.tensorflow.org/tutorials/distribute/custom_training for more details.

After which, I changed loss_fun to loss_object as below (Also changed the activation of the last layer to get the logits)

# Sparse categorical cross-entropy on logits, with reduction disabled
# (Reduction.NONE) so the loss is not averaged over the batch here.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction=tf.keras.losses.Reduction.NONE,
)

which gives the below mentioned error

RuntimeError: `apply_gradients() cannot be called in cross-replica context. Use `tf.distribute.Strategy.run` to enter replica context.

At this point I wrote fully custom training functions

def train_step(inputs):
    """Run one forward/backward pass and update the accuracy metric.

    Args:
        inputs: ``(x, y)`` pair holding the model inputs and their targets.

    Returns:
        The loss computed by ``compute_loss`` for this batch.
    """
    features, targets = inputs

    with tf.GradientTape() as tape:
        outputs = model(features, training=True)
        loss = compute_loss(targets, outputs)

    variables = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))

    train_accuracy.update_state(targets, outputs)
    return loss

@tf.function
def distributed_train_step(dataset_inputs):  # tokenize one (x, y) batch, run train_step via strategy.run, return the summed loss
    x, y = dataset_inputs  # presumably raw text and labels — confirm against the caller
    train_data = tokenizer(x, padding=True, truncation=True, max_length=MAX_LEN, return_tensors='tf')  # NOTE(review): Python tokenizer call inside a tf.function; the StagingError quoted below points at this line
    inputs = (train_data['input_ids'], train_data['attention_mask'])
    dataset_inputs = (inputs, y)  # rebound to the ((ids, mask), labels) form that train_step unpacks
    per_replica_losses = strategy.run(train_step, args=(dataset_inputs,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                         axis=None)  # SUM-reduce the losses returned by strategy.run

for epoch in range(2):
    # Train loop: accumulate the loss returned for every batch iterated from ds
    total_loss = 0.0
    num_batches = 0
    for x in tqdm(ds):
        total_loss += distributed_train_step(x)
        num_batches += 1
    train_loss = total_loss / num_batches  # mean of the per-batch losses
    
    template = ("Epoch {}, Loss: {}, Accuracy: {}")
    print(template.format(epoch+1, train_loss, train_accuracy.result()*100))

    train_accuracy.reset_states()  # clear the metric before the next epoch

which fortunately did run but gave the error below

StagingError                              Traceback (most recent call last)
<ipython-input-24-2cda132cf9fa> in <module>
      4     num_batches = 0
      5     for x in tqdm(ds):
----> 6         total_loss += distributed_train_step(x)
      7         num_batches += 1
      8     train_loss = total_loss / num_batches

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
    826     tracing_count = self.experimental_get_tracing_count()
    827     with trace.Trace(self._name) as tm:
--> 828       result = self._call(*args, **kwds)
    829       compiler = "xla" if self._experimental_compile else "nonXla"
    830       new_tracing_count = self.experimental_get_tracing_count()

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
    860       # In this case we have not created variables on the first call. So we can
    861       # run the first trace but we should fail if variables are created.
--> 862       results = self._stateful_fn(*args, **kwds)
    863       if self._created_variables:
    864         raise ValueError("Creating variables on a non-first call to a function"

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
   2939     with self._lock:
   2940       (graph_function,
-> 2941        filtered_flat_args) = self._maybe_define_function(args, kwargs)
   2942     return graph_function._call_flat(
   2943         filtered_flat_args, captured_inputs=graph_function.captured_inputs)  # pylint: disable=protected-access

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
   3359 
   3360           self._function_cache.missed.add(call_context_key)
-> 3361           graph_function = self._create_graph_function(args, kwargs)
   3362           self._function_cache.primary[cache_key] = graph_function
   3363 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
   3204             arg_names=arg_names,
   3205             override_flat_arg_shapes=override_flat_arg_shapes,
-> 3206             capture_by_value=self._capture_by_value),
   3207         self._function_attributes,
   3208         function_spec=self.function_spec,

/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    988         _, original_func = tf_decorator.unwrap(python_func)
    989 
--> 990       func_outputs = python_func(*func_args, **func_kwargs)
    991 
    992       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
    632             xla_context.Exit()
    633         else:
--> 634           out = weak_wrapped_fn().__wrapped__(*args, **kwds)
    635         return out
    636 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
    975           except Exception as e:  # pylint:disable=broad-except
    976             if hasattr(e, "ag_error_metadata"):
--> 977               raise e.ag_error_metadata.to_exception(e)
    978             else:
    979               raise

StagingError: in user code:

    <ipython-input-19-9d8bdb5f7f7c>:4 distributed_train_step  *
        train_data = tokenizer(x, padding=True, truncation=True, max_length=MAX_LEN, return_tensors='tf')
    /opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2305 __call__  *
        **kwargs,
    /opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_base.py:2490 batch_encode_plus  *
        **kwargs,
    /opt/conda/lib/python3.7/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py:163 _batch_encode_plus  *
        return super()._batch_encode_plus(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/transformers/tokenization_utils_fast.py:418 _batch_encode_plus  *
        for key in tokens_and_encodings[0][0].keys():

    IndexError: list index out of range

The IndexError: list index out of range might be solved but the speed of training was really really slow and hence I think something is wrong. At this point, any help will be highly appreciated.

Tokenize dataset using map on tf.data.Dataset.from_tensor_slices(....)

0 Answers