I ran this block of code using TF 2.2.0, Keras and some TPU config:
# Pick a tf.distribute strategy: TPU if one can be resolved, otherwise
# multi-GPU mirroring, otherwise the default strategy.
try:
    # Indexing os.environ raises KeyError when TPU_NAME is unset; the
    # resolver raises ValueError when no TPU can be found. Either case
    # means "no TPU here", so both fall through to the GPU/default path.
    TPU_WORKER = os.environ["TPU_NAME"]
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f"Running on TPU: {tpu.cluster_spec().as_dict()['worker']}")
    print(f"TPU_WORKER: {TPU_WORKER}")
except (KeyError, ValueError):
    tpu = None
    gpus = tf.config.experimental.list_logical_devices("GPU")

if tpu is not None:
    # NOTE(review): per the traceback, connect_to_cluster waits on pending
    # executor nodes first, so an error left over from an earlier failed
    # step resurfaces here when this cell is re-run in the same kernel.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
elif len(gpus) > 1:  # multiple GPUs on the VM
    # MirroredStrategy expects device name strings, not LogicalDevice objects.
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
else:
    strategy = tf.distribute.get_strategy()
and got this error message:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-27-a49335a43189> in <module>
15
16 if tpu:
---> 17 tf.config.experimental_connect_to_cluster(tpu)
18 tf.tpu.experimental.initialize_tpu_system(tpu)
19 strategy = tf.distribute.experimental.TPUStrategy(tpu)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/remote.py in connect_to_cluster(cluster_spec_or_resolver, job_name, task_index, protocol, make_master_device_default, cluster_device_filters)
181 context.set_server_def(server_def)
182 else:
--> 183 context.update_server_def(server_def)
184
185 if make_master_device_default and isinstance(
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/context.py in update_server_def(server_def)
2137
2138 def update_server_def(server_def):
-> 2139 context().update_server_def(server_def)
2140
2141
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/context.py in update_server_def(self, server_def, keep_alive_secs)
596 # Current executor might have pending nodes that involves updated remote
597 # devices. Wait for them to finish before updating.
--> 598 self.executor.wait()
599 self.executor.clear_error()
600 pywrap_tfe.TFE_ContextUpdateServerDef(self._context_handle,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/executor.py in wait(self)
65 def wait(self):
66 """Waits for ops dispatched in this executor to finish."""
---> 67 pywrap_tfe.TFE_ExecutorWaitForAllPendingNodes(self._handle)
68
69 def clear_error(self):
InvalidArgumentError: {{function_node __inference_train_function_75067}} Compilation failure: XLA can't deduce compile time constant output shape for strided slice: [4,?], output shape must be a compile-time constant
[[{{node model/tf_op_layer_strided_slice/strided_slice}}]]
TPU compilation failed
[[tpu_compile_succeeded_assert/_6359544293025479410/_3]]
This error:
InvalidArgumentError: {{function_node __inference_train_function_75067}} Compilation failure: XLA can't deduce compile time constant output shape for strided slice: [4,?], output shape must be a compile-time constant
[[{{node model/tf_op_layer_strided_slice/strided_slice}}]]
TPU compilation failed
[[tpu_compile_succeeded_assert/_6359544293025479410/_3]]
occurred during the previous run, and since then I can't re-run my code.
The only workaround is to restart the notebook and re-run everything.
But then I get the same error elsewhere:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<timed exec> in <module>
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
64 def _method_wrapper(self, *args, **kwargs):
65 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
---> 66 return method(self, *args, **kwargs)
67
68 # Running inside `run_distribute_coordinator` already.
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
853 context.async_wait()
854 logs = tmp_logs # No error, now safe to assign to logs.
--> 855 callbacks.on_train_batch_end(step, logs)
856 epoch_logs = copy.copy(logs)
857
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/callbacks.py in on_train_batch_end(self, batch, logs)
387 """
388 if self._should_call_train_batch_hooks:
--> 389 logs = self._process_logs(logs)
390 self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
391
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/callbacks.py in _process_logs(self, logs)
263 """Turns tensors into numpy arrays or Python scalars."""
264 if logs:
--> 265 return tf_utils.to_numpy_or_python_type(logs)
266 return {}
267
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/utils/tf_utils.py in to_numpy_or_python_type(tensors)
521 return t # Don't turn ragged or sparse tensors to NumPy.
522
--> 523 return nest.map_structure(_to_single_numpy_or_python_type, tensors)
524
/opt/conda/lib/python3.7/site-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
615
616 return pack_sequence_as(
--> 617 structure[0], [func(*x) for x in entries],
618 expand_composites=expand_composites)
619
/opt/conda/lib/python3.7/site-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
615
616 return pack_sequence_as(
--> 617 structure[0], [func(*x) for x in entries],
618 expand_composites=expand_composites)
619
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/utils/tf_utils.py in _to_single_numpy_or_python_type(t)
517 def _to_single_numpy_or_python_type(t):
518 if isinstance(t, ops.Tensor):
--> 519 x = t.numpy()
520 return x.item() if np.ndim(x) == 0 else x
521 return t # Don't turn ragged or sparse tensors to NumPy.
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in numpy(self)
959 """
960 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
--> 961 maybe_arr = self._numpy() # pylint: disable=protected-access
962 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
963
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in _numpy(self)
927 return self._numpy_internal()
928 except core._NotOkStatusException as e:
--> 929 six.raise_from(core._status_to_exception(e.code, e.message), None)
930
931 @property
/opt/conda/lib/python3.7/site-packages/six.py in raise_from(value, from_value)
InvalidArgumentError: {{function_node __inference_train_function_78422}} Compilation failure: XLA can't deduce compile time constant output shape for strided slice: [16,?], output shape must be a compile-time constant
[[{{node model/tf_op_layer_strided_slice/strided_slice}}]]
TPU compilation failed
[[tpu_compile_succeeded_assert/_626429452001451780/_8]]
when trying to train/fit the Keras model, although from the above call stack it's not clear at which point in the model this error occurred.
One more question: how do we clear the cache or buffer that is storing this error, so that we can reset the TPU and run our code again after making changes, without having to restart sessions or kernels?