
We want to create a custom layer in TensorFlow, so we decided to start with a toy example: a copy layer. After some trial and error, we got to the point where the gradient seems to pass the right values through. However, in the second iteration the features become NaNs. It may be a simple mistake, but currently I can't see it.

In general, I have two questions:

  1. Can someone spot the problem here, and how can it be solved?
  2. What is a good way to debug a TensorFlow session?

copy_op.cc

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include <stdio.h>

namespace tensorflow {



typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template<typename Device, typename T>
class MyCopyOp: public OpKernel {
public:
    explicit MyCopyOp(OpKernelConstruction* context) :
            OpKernel(context) {
    }

    void Compute(OpKernelContext* context) override {
        const Tensor& input = context->input(0);
        auto in_flat = input.flat<T>();

        printf("Debug MyCopyOp Features: %s \n",input.DebugString().c_str());

        Tensor* output = nullptr;
        OP_REQUIRES_OK(context,
                context->allocate_output(0, input.shape(), &output));

        auto out_flat = output->flat<T>();
        out_flat.setZero();

        for (int d = 0; d < input.dims(); ++d) {
            for (int i = 0; i < input.dim_size(d); ++i) {
                out_flat(d * input.dim_size(d) + i) = in_flat(
                        d * input.dim_size(d) + i);
            }
        }

        printf("Debug MyCopyOp Output: %s \n",output->DebugString().c_str());
    }

};


template<typename Device, typename T>
class MyCopyGradOp: public OpKernel {
public:
    explicit MyCopyGradOp(OpKernelConstruction* context) :
            OpKernel(context) {

    }

    void Compute(OpKernelContext* context) override {
        printf("called MyCopyGradOp.Compute() \n");
        const Tensor& gradients = context->input(0);
        const Tensor& features = context->input(1);
        printf("Debug MyCopyOpGrad Gradients: %s \n",gradients.DebugString().c_str());
        printf("Debug MyCopyOpGrad Features: %s \n",features.DebugString().c_str());

        TensorShape output_shape = features.shape();

        Tensor* output = nullptr;
        OP_REQUIRES_OK(context,
                context->allocate_output(0, output_shape, &output));
        output->flat<T>().setZero();

        const T* btm_ptr = gradients.flat<T>().data();
        T* top_ptr = output->flat<T>().data();

        for (int i = 0; i < gradients.NumElements(); ++i) {
            top_ptr[i] = btm_ptr[i];
        }

        printf("Debug MyCopyOpGrad Output: %s \n",output->DebugString().c_str());
        printf("---------------------------------- \n");
    }

};


REGISTER_OP("MyCopy")
.Input("features: T")
.Output("output: T")
.Attr("T: realnumbertype")
.Doc(R"doc(
Copies all input values to the output
)doc");

REGISTER_OP("MyCopyGrad")
.Input("gradients: T")
.Input("features: T")
.Output("backprops: T")
.Attr("T: realnumbertype")
.Doc(R"doc(
TODO!!
)doc");


#define REGISTER_MYCOPY_KERNELS(type)                                           \
  REGISTER_KERNEL_BUILDER(                                                      \
      Name("MyCopy").Device(DEVICE_CPU).TypeConstraint<type>("T"),              \
      MyCopyOp<Eigen::ThreadPoolDevice, type>);                                 \
  REGISTER_KERNEL_BUILDER(                                                      \
      Name("MyCopyGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"),          \
      MyCopyGradOp<Eigen::ThreadPoolDevice, type>);

// GPU kernels are not registered yet:
// REGISTER_KERNEL_BUILDER(
//     Name("MyCopy").Device(DEVICE_GPU).TypeConstraint<type>("T"),
//     MyCopyOp<Eigen::GpuDevice, type>);
// REGISTER_KERNEL_BUILDER(
//     Name("MyCopyGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),
//     MyCopyGradOp<Eigen::GpuDevice, type>);


REGISTER_MYCOPY_KERNELS(float); 
REGISTER_MYCOPY_KERNELS(int);
REGISTER_MYCOPY_KERNELS(double);


}  // namespace tensorflow

We used the simple MNIST example as the basis:

layer_test.py

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

import tensorflow as tf
from tensorflow.python.framework import ops
copy_op_module = tf.load_op_library('copy_op.so')

@ops.RegisterGradient("MyCopy")
def _CopyOpGrad(op, grad):
  return copy_op_module.my_copy_grad(grad,op.inputs[0])

sess = tf.InteractiveSession()

x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])

W = tf.Variable(tf.zeros([784,10]))
b = tf.Variable(tf.zeros([10]))

sess.run(tf.initialize_all_variables())

y1 = tf.nn.softmax(tf.matmul(x,W) + b)
y = copy_op_module.my_copy(y1)            # Here: the MyCopy layer is inserted

cross_entropy = -tf.reduce_sum(y_*tf.log(y))

train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

for i in range(2):
  batch = mnist.train.next_batch(50)
  train_step.run(feed_dict={x: batch[0], y_: batch[1]})

correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}))

compile:

TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
TF_LIB=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')
g++ -std=c++11 -shared copy_op.cc -o copy_op.so -I $TF_INC -L $TF_LIB -fPIC -Wl,-rpath $TF_LIB

output:

Debug MyCopyOp Features: Tensor<type: float shape: [50,10] values: 0.1 0.1 0.1...> 
Debug MyCopyOp Output: Tensor<type: float shape: [50,10] values: 0.1 0.1 0.1...> 
called MyCopyGradOp.Compute() 
Debug MyCopyOpGrad Gradients: Tensor<type: float shape: [50,10] values: -0 -0 -0...> 
Debug MyCopyOpGrad Features: Tensor<type: float shape: [50,10] values: 0.1 0.1 0.1...> 
Debug MyCopyOpGrad Output: Tensor<type: float shape: [50,10] values: -0 -0 -0...> 
---------------------------------- 
Debug MyCopyOp Features: Tensor<type: float shape: [50,10] values: nan nan nan...> 
Debug MyCopyOp Output: Tensor<type: float shape: [50,10] values: nan nan nan...> 
called MyCopyGradOp.Compute() 
Debug MyCopyOpGrad Gradients: Tensor<type: float shape: [50,10] values: nan nan nan...> 
Debug MyCopyOpGrad Features: Tensor<type: float shape: [50,10] values: nan nan nan...> 
Debug MyCopyOpGrad Output: Tensor<type: float shape: [50,10] values: nan nan nan...> 
---------------------------------- 
Debug MyCopyOp Features: Tensor<type: float shape: [10000,10] values: nan nan nan...> 
Debug MyCopyOp Output: Tensor<type: float shape: [10000,10] values: nan nan nan...> 
0.098

Thanks a lot in advance!

avo
  • From the output, it looks like your `MyCopyOp` and `MyCopyGradOp` are working as intended. Can you confirm whether or not the weights become `NaN` without using the copy? (To do this, simply remove the copy layer, run a single training step, then call `y1.eval(feed_dict={x: batch[0], y_: batch[1]})` in the second iteration.) – mrry Mar 24 '16 at 16:07
  • For what it's worth, there are known stability issues with using `-tf.reduce_sum(y_ * tf.log(y))` to compute cross-entropy (use `tf.nn.softmax_cross_entropy_with_logits(y, y_)` instead), and initializing your `W` variable to zeros often leads to a worse outcome than initializing it randomly. (See [this answer](http://stackoverflow.com/a/36134261/3574081) for more discussion.) – mrry Mar 24 '16 at 16:31
  • Thanks for helping! 1. Without the copy layer, `y1` evaluates to `[[ 0.07910535 0.07910535 0.07910535 0.11042032 0.10930145 ...`, while with the copy layer the result after one step is `[[ nan nan nan nan nan ...` – avo Mar 24 '16 at 18:03
  • Does the gradient for `W` or `b` change when you add/remove the copy layer? You can get tensors for these by calling `W_grad, b_grad = tf.gradients(cross_entropy, [W, b])`, and then evaluate them with `sess.run([W_grad, b_grad], feed_dict={...})`. – mrry Mar 24 '16 at 18:20
  • Just using `tf.nn.softmax_cross_entropy_with_logits(y, y_)` does the trick! – avo Mar 24 '16 at 18:21
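
Expanding mrry's debugging suggestion from the comments into a runnable snippet may help with question 2; this is only a rough sketch and reuses the tensor names from layer_test.py above:

batch = mnist.train.next_batch(50)
feed = {x: batch[0], y_: batch[1]}

# Inspect the softmax output before and after the custom copy layer.
before_copy, after_copy = sess.run([y1, y], feed_dict=feed)
print(before_copy[0], after_copy[0])

# Symbolic gradients of the loss w.r.t. the variables, evaluated on the same batch.
W_grad, b_grad = tf.gradients(cross_entropy, [W, b])
print(sess.run([W_grad, b_grad], feed_dict=feed))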

1 Answer


From mrry's comment above: there are known stability issues with using `-tf.reduce_sum(y_ * tf.log(y))` to compute cross-entropy (use `tf.nn.softmax_cross_entropy_with_logits(y, y_)` instead), and initializing your `W` variable to zeros often leads to a worse outcome than initializing it randomly. [This answer](http://stackoverflow.com/a/36134261/3574081) has more detail about the weight initialization issue.
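
A rough sketch of how the relevant part of layer_test.py could change with those two fixes (the copy layer is applied to the unnormalized logits here, because the fused op computes the softmax internally; the truncated-normal stddev is an arbitrary choice):

# Random initialization instead of zeros (stddev chosen arbitrarily for this sketch).
W = tf.Variable(tf.truncated_normal([784, 10], stddev=0.1))
b = tf.Variable(tf.zeros([10]))

logits = tf.matmul(x, W) + b
y = copy_op_module.my_copy(logits)   # custom copy layer, now applied to the logits

# Numerically stable cross-entropy: softmax and log are fused inside the op,
# so a zero probability can no longer turn into NaN through tf.log.
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)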

dga