I am trying to train my model using google cloud's TPUs. The model works fine on CPU and GPU, and I can run the TPU tutorials without any problems (so it is not a problem of connecting to TPUs). However, when I run my program on the TPU cloud I get an error. The most important line is probably the following:
NotImplementedError: Non-resource Variables are not supported inside TPU computations (operator name: training_op/update_2nd_caps/primary_to_first_fc/W/ApplyAdam/RefEnter)
And here is the full error in case there is something important there:
Traceback (most recent call last):
File "TPU_playground.py", line 85, in <module>
capser.train(input_fn=train_input_fn_tpu, steps=n_steps)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 366, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1119, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1132, in _train_model_default
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1992, in _call_model_fn
features, labels, mode, config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1107, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2223, in _model_fn
_train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2537, in _train_on_tpu_system
device_assignment=ctx.device_assignment)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 733, in shard
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 394, in replicate
device_assignment, name)[1]
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 546, in split_compile_and_replicate
outputs = computation(*computation_inputs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2530, in multi_tpu_train_steps_on_single_shard
single_tpu_train_step, [_INITIAL_LOSS])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 207, in repeat
cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 169, in while_loop
name="")
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 3209, in while_loop
result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2941, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2878, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 120, in body_wrapper
outputs = body(*(inputs + dequeue_ops))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/training_loop.py", line 203, in body_wrapper
return [i + 1] + _convert_to_list(body(*args))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1166, in train_step
self._call_model_fn(features, labels))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 1337, in _call_model_fn
estimator_spec = self._model_fn(features=features, **kwargs)
File "/home/adrien_doerig/capser/capser_7_model_fn.py", line 100, in model_fn_tpu
**output_decoder_deconv_params)
File "/home/adrien_doerig/capser/capser_model.py", line 341, in capser_model
loss_training_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step(), name="training_op")
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 409, in minimize
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py", line 114, in apply_gradients
return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 602, in apply_gradients
update_ops.append(processor.update_op(self, grad))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 113, in update_op
update_op = optimizer._apply_dense(g, self._v) # pylint: disable=protected-access
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/adam.py", line 148, in _apply_dense
grad, use_locking=self._use_locking).op
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/gen_training_ops.py", line 293, in apply_adam
use_nesterov=use_nesterov, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 3414, in create_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1782, in __init__
self._control_flow_post_processing()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1793, in _control_flow_post_processing
self._control_flow_context.AddOp(self)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2430, in AddOp
self._AddOpInternal(op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2451, in _AddOpInternal
real_x = self.AddValue(x)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2398, in AddValue
self._outer_context.AddInnerOp(enter.op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 310, in AddInnerOp
self._AddOpInternal(op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 287, in _AddOpInternal
"(operator name: %s)" % op.name)
NotImplementedError: Non-resource Variables are not supported inside TPU computations (operator name: training_op/update_2nd_caps/primary_to_first_fc/W/ApplyAdam/RefEnter)
It seems that the forward pass of the graph is built fine, but the backprop using AdamOptimizer is not supported by the TPUs in this case. I tried using more standard optimizers (GradientDescentOptimizer and MomentumOptimizer) but it doesn't help. All the tensors in the feedforward pass are in formats compatible with TPUs (i.e. tf.float32).
Does anyone have suggestions as to what I should try?
Thank you!