I want to train an Encoder-Decoder model that converts dates from a string format to a numeric format. For example, I want to convert "April 22, 2019" to "2019-04-22".
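(To pin down the exact mapping I'm after: the conversion itself is trivial to do deterministically with Python's datetime module, as in the throwaway helper below; the whole point is to have a model learn it.)
from datetime import datetime

def convert_date(date_string):
    # "April 22, 2019" -> "2019-04-22"
    return datetime.strptime(date_string, "%B %d, %Y").strftime("%Y-%m-%d")

print(convert_date("April 22, 2019"))  # prints 2019-04-22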
Here's the code that I used to create the dataset:
import numpy as np

months = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December"
}
def generate_dataset(num_examples, train_ratio, valid_ratio, test_ratio):
    train_set = []
    valid_set = []
    test_set = []
    longest_sequence_length = 19  # longest one is "September 29, 2021" + end-of-stream token
    PADDING_TOKEN = "<pad>"
    END_OF_STREAM_TOKEN = "<EOS>"
    for _ in range(0, int(num_examples * train_ratio)):
        random_year = np.random.randint(1, 2022)
        random_month = np.random.randint(1, 13)
        random_day = np.random.randint(1, 29)
        # I'm ignoring the fact that different months have different numbers of days in the line above
        random_date_string = [months[random_month], " ", str(random_day), ", ", str(random_year)]
        for _ in range(len(random_date_string), longest_sequence_length - 1):
            random_date_string = random_date_string + [PADDING_TOKEN]
        random_date_string = random_date_string + [END_OF_STREAM_TOKEN]
        # I also probably don't need an explicit cast to string here
        random_date_numeric = [str(random_year), "-", str(random_month), "-", str(random_day)]
        for _ in range(len(random_date_numeric), longest_sequence_length - 1):
            random_date_numeric = random_date_numeric + [PADDING_TOKEN]
        random_date_numeric = random_date_numeric + [END_OF_STREAM_TOKEN]
        train_set.append([random_date_string, random_date_numeric])
    for _ in range(int(num_examples * train_ratio), int(num_examples * train_ratio + num_examples * valid_ratio)):
        random_year = np.random.randint(1, 2022)
        random_month = np.random.randint(1, 13)
        random_day = np.random.randint(1, 29)
        random_date_string = [months[random_month], " ", str(random_day), ", ", str(random_year)]
        for _ in range(len(random_date_string), longest_sequence_length - 1):
            random_date_string = random_date_string + [PADDING_TOKEN]
        random_date_string = random_date_string + [END_OF_STREAM_TOKEN]
        random_date_numeric = [str(random_year), "-", str(random_month), "-", str(random_day)]
        for _ in range(len(random_date_numeric), longest_sequence_length - 1):
            random_date_numeric = random_date_numeric + [PADDING_TOKEN]
        random_date_numeric = random_date_numeric + [END_OF_STREAM_TOKEN]
        valid_set.append([random_date_string, random_date_numeric])
    for _ in range(int(num_examples * train_ratio + num_examples * valid_ratio), num_examples):
        random_year = np.random.randint(1, 2022)
        random_month = np.random.randint(1, 13)
        random_day = np.random.randint(1, 29)
        random_date_string = [months[random_month], " ", str(random_day), ", ", str(random_year)]
        for _ in range(len(random_date_string), longest_sequence_length - 1):
            random_date_string = random_date_string + [PADDING_TOKEN]
        random_date_string = random_date_string + [END_OF_STREAM_TOKEN]
        random_date_numeric = [str(random_year), "-", str(random_month), "-", str(random_day)]
        for _ in range(len(random_date_numeric), longest_sequence_length - 1):
            random_date_numeric = random_date_numeric + [PADDING_TOKEN]
        random_date_numeric = random_date_numeric + [END_OF_STREAM_TOKEN]
        test_set.append([random_date_string, random_date_numeric])
    return train_set, valid_set, test_set
train_set, valid_set, test_set = generate_dataset(10000, 0.7, 0.1, 0.2)
train_set = np.array(train_set)
valid_set = np.array(valid_set)
test_set = np.array(test_set)
Here's what a data entry looks like (train_set[0]):
array([['July', ' ', '6', ', ', '225', '<pad>', '<pad>', '<pad>',
'<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',
'<pad>', '<pad>', '<pad>', '<EOS>'],
['225', '-', '7', '-', '6', '<pad>', '<pad>', '<pad>', '<pad>',
'<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',
'<pad>', '<pad>', '<EOS>']], dtype='<U9')
Here's the shape of my training set (number of examples, [string sequence, numeric sequence], tokens per sequence):
(7000, 2, 19)
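As the dump above shows, everything is stored as strings. Since the Embedding layer I use further down expects integer token IDs rather than raw strings, I suspect I'll eventually need something like the following sketch (token_to_id, encode, and encoded_first_input are just names I made up, and this only covers tokens that actually occur in train_set, not e.g. <SOS>):
# Hypothetical sketch: map every distinct string token in train_set to an integer ID.
all_tokens = sorted({token for example in train_set for sequence in example for token in sequence})
token_to_id = {token: i for i, token in enumerate(all_tokens)}

def encode(sequence):
    # Turns one padded token sequence into an int32 array of token IDs.
    return np.array([token_to_id[token] for token in sequence], dtype=np.int32)

encoded_first_input = encode(train_set[0][0])
I'm not sure whether that's the right fix, though, which is partly why I'm asking.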
I then proceed to add the start-of-sequence tokens:
START_OF_SEQUENCE_TOKEN = "<SOS>"
train_set_with_start_of_sequence_token = []
for entry in train_set:
    new_entry_0 = np.insert(entry[0][:-1], 0, START_OF_SEQUENCE_TOKEN)
    new_entry_1 = np.insert(entry[1][:-1], 0, START_OF_SEQUENCE_TOKEN)
    new_entry = [new_entry_0, new_entry_1]
    train_set_with_start_of_sequence_token.append(np.array(new_entry))
train_set_with_start_of_sequence_token = np.array(train_set_with_start_of_sequence_token)
seq_lengths = np.full([7000], 19)  # one entry per training example; every sequence is 19 tokens long
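As an aside, I think the same insertion could be done without the Python loop; a minimal sketch, assuming the (7000, 2, 19) layout above (train_set_with_sos is just my own name for the result):
sos_column = np.full((train_set.shape[0], 2, 1), START_OF_SEQUENCE_TOKEN)
# Drop the last token of each sequence and prepend <SOS>, keeping the length at 19.
train_set_with_sos = np.concatenate([sos_column, train_set[:, :, :-1]], axis=2)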
Here's the Encoder-Decoder model:
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)
vocab_size = 2021 * 12 * 23 # the total number of possible dates
embed_size = 512
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)
encoder = keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]
sampler = tfa.seq2seq.sampler.TrainingSampler()
decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,
                                                 output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state,
    sequence_length=sequence_lengths)
Y_proba = tf.nn.softmax(final_outputs.rnn_output)
model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
    outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit([train_set[:, 1, :], train_set_with_start_of_sequence_token[:, 1, :], seq_lengths], train_set[:, 0, :], epochs=2)
When I try to call the model's fit method, I get the following error:
Train on 7000 samples
Epoch 1/2
32/7000 [..............................] - ETA: 35:46
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
<ipython-input-19-e60cd6fd68df> in <module>
----> 1 history = model.fit([train_set[:, 1, :], train_set_with_start_of_sequence_token[:, 1, :], seq_lengths], train_set[:, 0, :], epochs=2)
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
340 mode=ModeKeys.TRAIN,
341 training_context=training_context,
--> 342 total_epochs=epochs)
343 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
344
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
126 step=step, mode=mode, size=current_batch_size) as batch_logs:
127 try:
--> 128 batch_outs = execution_function(iterator)
129 except (StopIteration, errors.OutOfRangeError):
130 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)
96 # `numpy` translates Tensors to values in Eager mode.
97 return nest.map_structure(_non_none_constant_value,
---> 98 distributed_function(input_fn))
99
100 return execution_function
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
566 xla_context.Exit()
567 else:
--> 568 result = self._call(*args, **kwds)
569
570 if tracing_count == self._get_tracing_count():
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
630 # Lifting succeeded, so variables are initialized and we can run the
631 # stateless function.
--> 632 return self._stateless_fn(*args, **kwds)
633 else:
634 canon_args, canon_kwds = \
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in __call__(self, *args, **kwargs)
2361 with self._lock:
2362 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2363 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2364
2365 @property
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _filtered_call(self, args, kwargs)
1609 if isinstance(t, (ops.Tensor,
1610 resource_variable_ops.BaseResourceVariable))),
-> 1611 self.captured_inputs)
1612
1613 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1690 # No tape is watching; skip to running the function.
1691 return self._build_call_outputs(self._inference_function.call(
-> 1692 ctx, args, cancellation_manager=cancellation_manager))
1693 forward_backward = self._select_forward_and_backward_functions(
1694 args,
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in call(self, ctx, args, cancellation_manager)
543 inputs=args,
544 attrs=("executor_type", executor_type, "config_proto", config),
--> 545 ctx=ctx)
546 else:
547 outputs = execute.execute_with_cancellation(
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
~/anaconda3/envs/tf2/lib/python3.7/site-packages/six.py in raise_from(value, from_value)
UnimplementedError: Cast string to int32 is not supported
[[node Cast (defined at <ipython-input-19-e60cd6fd68df>:1) ]] [Op:__inference_distributed_function_5170]
Function call stack:
distributed_function
I also tried making a TensorFlow dataset, like so:
train_set_string = tf.data.Dataset.from_tensor_slices(train_set[:, 0, :])
train_set_numeric = tf.data.Dataset.from_tensor_slices(train_set[:, 1, :])
train_set_with_start_of_sequence_token_numeric = tf.data.Dataset.from_tensor_slices(train_set_with_start_of_sequence_token[:, 1, :])
train_set_string = train_set_string.batch(1).prefetch(1)
train_set_numeric = train_set_numeric.batch(1).prefetch(1)
train_set_with_start_of_sequence_token_numeric = train_set_with_start_of_sequence_token_numeric.batch(1).prefetch(1)
history = model.fit([train_set_numeric, train_set_with_start_of_sequence_token_numeric, seq_lengths], train_set_string, epochs=2)
If I try the above code, I get the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-24-4ddb13a87ce5> in <module>
----> 1 history = model.fit([train_set_numeric, train_set_with_start_of_sequence_token_numeric, seq_lengths], train_set_string, epochs=2)
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
233 max_queue_size=max_queue_size,
234 workers=workers,
--> 235 use_multiprocessing=use_multiprocessing)
236
237 total_samples = _get_total_number_of_samples(training_data_adapter)
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
531 'at same time.')
532
--> 533 adapter_cls = data_adapter.select_data_adapter(x, y)
534
535 # Handle validation_split, we want to split the data and get the training
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in select_data_adapter(x, y)
996 "Failed to find data adapter that can handle "
997 "input: {}, {}".format(
--> 998 _type_name(x), _type_name(y)))
999 elif len(adapter_cls) > 1:
1000 raise RuntimeError(
ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'numpy.ndarray'>", "<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>"}), <class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>
If I try:
history = model.fit([np.array(train_set_numeric), np.array(train_set_with_start_of_sequence_token_numeric), seq_lengths], np.array(train_set_string), epochs=2)
I get:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-26-7bed963333ee> in <module>
----> 1 history = model.fit([np.array(train_set_numeric), np.array(train_set_with_start_of_sequence_token_numeric), seq_lengths], np.array(train_set_string), epochs=2)
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 max_queue_size=max_queue_size,
818 workers=workers,
--> 819 use_multiprocessing=use_multiprocessing)
820
821 def evaluate(self,
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
212 steps_per_epoch,
213 ModeKeys.TRAIN,
--> 214 validation_split=validation_split)
215 dist_utils.validate_callbacks(input_callbacks=callbacks,
216 optimizer=model.optimizer)
~/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/keras/distribute/distributed_training_utils.py in process_batch_and_step_size(strategy, inputs, batch_size, steps_per_epoch, mode, validation_split)
460 first_x_value = nest.flatten(inputs)[0]
461 if isinstance(first_x_value, np.ndarray):
--> 462 num_samples = first_x_value.shape[0]
463 if validation_split and 0. < validation_split < 1.:
464 num_samples = int(num_samples * (1 - validation_split))
IndexError: tuple index out of range
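The only other idea I have is to combine all three inputs and the target into a single tf.data.Dataset instead of passing a mixed list, along these lines (a sketch I haven't verified; I suspect it still wouldn't fix the string-to-int32 cast from the first error, since the underlying arrays still contain strings):
# Sketch: zip the three inputs and the target into one dataset.
dataset = tf.data.Dataset.from_tensor_slices(
    ((train_set[:, 1, :],
      train_set_with_start_of_sequence_token[:, 1, :],
      seq_lengths),
     train_set[:, 0, :]))
dataset = dataset.batch(32).prefetch(1)
# history = model.fit(dataset, epochs=2)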
Can someone tell me what's going on here and how to fix it?