
I use TensorFlow 0.9. I want to save my model and then restore it. I simply add a tf.train.Saver() to save and restore my training variables.

This is my code:

import tensorflow as tf
import input_data
import os

checkpoint_dir='./ckpt_dir/'

mnist = input_data.read_data_sets("MNIST_data", one_hot = True)

x = tf.placeholder(tf.float32, shape = [None , 784])
y_ = tf.placeholder(tf.float32, [None, 10])

sess = tf.InteractiveSession()

def load_model(sess, saver, checkpoint_dir):
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        print(ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        sess.run(init)
    return

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev = 0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape = shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides = [1, 1, 1, 1], padding = "SAME")

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1],
                          padding = "SAME")

W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])

x_image = tf.reshape(x, [-1, 28, 28, 1])

# first convolutional layer
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# second convolutional layer
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])

h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# dropout
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# readout layer
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

# loss and training step
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices = [1]))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

init = tf.initialize_all_variables()

saver = tf.train.Saver()

load_model(sess, saver, checkpoint_dir)

for i in range(1):
    batch = mnist.train.next_batch(50)
    if i % 10 == 0:
        train_accuracy = accuracy.eval(feed_dict = {x : batch[0], y_ : batch[1], keep_prob : 1.0})
        print("step %d, training accuracy %g" % (i, train_accuracy))

    train_step.run(feed_dict = {x : batch[0], y_ : batch[1], keep_prob : 0.5})

print("test accuracy %g" % accuracy.eval(feed_dict = {
    x : mnist.test.images, y_ : mnist.test.labels, keep_prob : 1.0}))

tf.scalar_summary("accuracy", accuracy)

saver.save(sess, checkpoint_dir + 'model.ckpt')

When I restore the checkpoint:

saver.restore(sess, ckpt.model_checkpoint_path)

TensorFlow throws this error:

Traceback (most recent call last):
.
.
.
NotFoundError: Tensor name "global_step_7" not found in checkpoint files ./ckpt_dir/model.ckpt-0
[[Node: save_18/restore_slice_438 = RestoreSlice[dt=DT_INT32, preferred_shard=-1, _device="/job:localhost/replica:0/task:0/cpu:0"]]]
Caused by op 'save_18/restore_slice_438', defined at:
File "/home/m/anaconda3/lib/python3.5/site-packages/spyderlib/widgets/externalshell/start_ipython_kernel.py", line 205, in
ipythonkernel.start()
.
.
.
File "/home/m/anaconda3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1224, in __init
raise TypeError("Control input must be an Operation, "

EDIT:

I use Anaconda. The first time I run this code in Spyder or IPython with "run filename.py", it saves the model to the checkpoint, but when I run the code again, it throws the error.

But when I close Spyder or IPython, open it again, and run the code, it restores the checkpoint correctly.

Also, when I run "python filename.py" in a terminal, it always works and doesn't throw any error.
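One quick way to check whether the graph is accumulating nodes across runs in the same kernel (a diagnostic sketch, not part of my training script):

import tensorflow as tf

# a non-zero count before the model is built means the previous run's
# nodes are still sitting in this kernel's default graph
print(len(tf.get_default_graph().get_operations()))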

  • The name `global_step_7` sounds like you created the same tensor 7 times. Does it work if you restart IPython and create the graph only once? – Olivier Moindrot Jun 16 '16 at 12:27

1 Answer

You need to reset the default graph at the beginning of your script when you run the file again.


If you don't reset the default graph and run the following line twice:

x = tf.Variable(1, name='x')
print(x.name)

You will see that the first time x has the name "x:0", and the second time its name is "x_1:0". This is what confuses tf.train.Saver:

  • it first saves the value of x under the name "x:0"
  • then in the next run you try to load the saved value of x, but now the variable is named "x_1:0", so the saver looks for a saved value under the name "x_1:0", cannot find it, and returns an error (this is what the sketch below reproduces).
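Here is a minimal sketch of that failure mode in a single process (the checkpoint name repro.ckpt is only an example):

import tensorflow as tf

# "first run": the variable is saved in the checkpoint under the name "x"
x = tf.Variable(1, name='x')
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    saver.save(sess, 'repro.ckpt')

# "second run" in the same process, without resetting the graph:
# this variable is named "x_1", so a new Saver looks for a tensor
# called "x_1" in the checkpoint and cannot find it
x2 = tf.Variable(1, name='x')
saver2 = tf.train.Saver()
with tf.Session() as sess:
    saver2.restore(sess, 'repro.ckpt')  # NotFoundError: Tensor name "x_1" not found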

However, you can reset the default graph at the beginning using tf.reset_default_graph(). This creates an empty graph and uses it as the default graph. The name of x can then be the same across the two runs:

# First run
tf.reset_default_graph()
x = tf.Variable(1, name='x')
print(x.name)  # prints 'x:0'

# Next run
tf.reset_default_graph()
x = tf.Variable(1, name='x')
print(x.name)  # prints 'x:0'

The two Variables can now have the same name because they are no longer in the same graph.
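Applied to the script in the question, this means calling tf.reset_default_graph() once at the very top, before any placeholder, variable or session is created (a sketch of the intended ordering, reusing the names from the question):

import tensorflow as tf

tf.reset_default_graph()          # start from an empty graph on every run

x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, [None, 10])

sess = tf.InteractiveSession()    # session attached to the fresh graph
# ... build the model, the Saver and the training loop exactly as before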


Another way of doing it is to create a graph at the beginning and explicitly use it as the default graph:

graph = tf.Graph()
with graph.as_default():
    x = tf.Variable(1, name='x')
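The Saver and the Session then have to be tied to that same graph (a minimal sketch; the ./ckpt_dir/ path is the one from the question and is assumed to exist):

graph = tf.Graph()
with graph.as_default():
    x = tf.Variable(1, name='x')
    init = tf.initialize_all_variables()
    saver = tf.train.Saver()      # created inside the graph it saves and restores

with tf.Session(graph=graph) as sess:
    sess.run(init)
    saver.save(sess, './ckpt_dir/model.ckpt')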
  • Thanks for your help. But when I add this to my code, it throws this error: ValueError: Tensor("Reshape:0", shape=(?, 48, 48, 1), dtype=float32) must be from the same graph as Tensor("weight:0", shape=(5, 5, 1, 100), dtype=float32_ref). – Tavakoli Jun 29 '16 at 09:28
  • 1
    You have to put `tf.reset_default_graph()` at the very beginning of your code, before creating any tensor. – Olivier Moindrot Jun 29 '16 at 09:35
  • I use `tf.reset_default_graph()` in the `main` function and call the `model` function to create the model. The error shows when `conv2d` is called. – Tavakoli Jun 29 '16 at 14:19
  • You must have created the filter of shape `[5, 5, 1, 100]` before calling `tf.reset_default_graph()`. I can't see where without your full code. – Olivier Moindrot Jun 29 '16 at 14:24
  • I had something like this outside the `model()` function: `weights = { 'W_conv1' : weight_variable([5 , 5, IMAGE_CHANNELS, 100]), .... }`. I moved `weights` and `biases` inside the `model()` function and it worked. Thanks for your help. – Tavakoli Jun 29 '16 at 18:48