I have the following code (below) to perform synchronous gradient descent in TensorFlow, based on their example here: https://github.com/tensorflow/models/blob/master/inception/inception/inception_distributed_train.py
Run With
The command I'm using to run this session on a cluster I have access to is:
(python2 SyncSGD.py --job_name='ps' --task_index=0 &) && (python2 SyncSGD.py --job_name='worker' --task_index=0 &) && (python2 SyncSGD.py --job_name='worker' --task_index=1 &) && (python2 SyncSGD.py --job_name='worker' --task_index=2 &)
The Problem
For some reason the additional worker processes never advance past the call to sv.prepare_or_wait_for_session. The chief worker (FLAGS.task_index == 0) runs one full epoch of operations and then just hangs or times out. Only after I kill all the processes, or the timeout expires, do the other workers advance past sv.prepare_or_wait_for_session; they then print that they've reached the first epoch and hang there.
Not consistently reproducible
This doesn't happen the same way every time. Sometimes the chief never gets past the first epoch's run operation, and the other two workers eventually catch up after about 30 seconds and then also hang at their run op. Other times the workers get through a random number of epochs (one worker finishes 2 epochs, the other two only 1). Sometimes it even runs to completion. I have no idea what causes this, but it looks like a serious synchronization problem between the Supervisor (coordinator wrapper) and SyncReplicasOptimizer. I've searched extensively and still don't know what to search for to fix this, so this question is really my last resort.
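For context, the handshake I'm trying to follow from the linked Inception example boils down to this, as I understand it (condensed; the names roughly mirror my code below, and my actual code builds train_op from compute_gradients/apply_gradients instead of minimize):

rep_op = tf.train.SyncReplicasOptimizer(grad_op,
                                        replicas_to_aggregate=len(workers),
                                        total_num_replicas=len(workers),
                                        replica_id=FLAGS.task_index)
train_op = rep_op.minimize(cross_entropy, global_step=global_step)
# ops that only the chief is supposed to start/run
init_token_op = rep_op.get_init_tokens_op()
chief_queue_runner = rep_op.get_chief_queue_runner()

sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), global_step=global_step)
with sv.prepare_or_wait_for_session(server.target) as sess:
    if FLAGS.task_index == 0:
        sv.start_queue_runners(sess, [chief_queue_runner])
        sess.run(init_token_op)
    # every worker then loops over sess.run(train_op), and each step blocks
    # until gradients from replicas_to_aggregate workers have been aggregated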
The Code
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.platform import app
import sys
import time
# input flags
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS
class SynchSGD:
def __init__(self,parameter_servers,workers ):
self.parameter_servers=parameter_servers
self.workers=workers
self.cluster = tf.train.ClusterSpec({"ps":parameter_servers, "worker":workers})
# start a server for a specific task
self.server = tf.train.Server(self.cluster,
job_name=FLAGS.job_name,
task_index=FLAGS.task_index)
def run(self,fetches,fetches_format,dataset,batch_size=1,test_dataset=None,learning_rate=0.001,test_fetches_format=None,training_epochs=20, logs_path='/tmp/mnist/1'):
if FLAGS.job_name == "ps":
self.server.join()
elif FLAGS.job_name == "worker":
# Between-graph replication
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d/cpu:0" % (FLAGS.task_index),#FLAGS.task_index),
cluster=self.cluster)):
print(str(FLAGS.task_index),"b4 gloabl step")
# count the number of updates
global_step = tf.get_variable('global_step', [],
initializer = tf.constant_initializer(0),
trainable = False)
print(str(FLAGS.task_index),"b4 fetches")
inputs,fetches=fetches(learning_rate,global_step)
if FLAGS.task_index == 0:
chief_queue_runner = fetches[-1]
init_token_op = fetches[-2]
init_op = tf.initialize_all_variables()
print(str(FLAGS.task_index),"b4 sv")
sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
global_step=global_step,
init_op=init_op,
logdir=logs_path)
begin_time = time.time()
frequency = 100
print(str(FLAGS.task_index),"b4 sesh")
with sv.prepare_or_wait_for_session(self.server.target) as sess:
# is chief
print(str(FLAGS.task_index),"b4 qr")
queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
sv.start_queue_runners(sess, queue_runners)
print(str(FLAGS.task_index)," after qr")
if FLAGS.task_index == 0:
sv.start_queue_runners(sess, [chief_queue_runner])
sess.run(init_token_op)
print(str(FLAGS.task_index)," after cqr")
if 'summary' in fetches_format:
# create log writer object (this will log on every machine)
writer = tf.train.SummaryWriter(logs_path, graph=tf.get_default_graph())
# perform training cycles
start_time = time.time()
for epoch in range(training_epochs):
# number of batches in one epoch
batch_count = int(dataset.num_examples/batch_size)
count = 0
print(str(FLAGS.task_index),str(epoch))
for i in range(batch_count):
batch_x, batch_y = dataset.next_batch(batch_size)
# perform the operations we defined earlier on batch
result = sess.run(
fetches[:-2],
feed_dict={inputs[0]: batch_x, inputs[1]: batch_y})
print(str(FLAGS.task_index),str(i))
if 'summary' in fetches_format:
writer.add_summary(result[fetches_format['summary']], result[fetches_format['step']])
count += 1
if count % frequency == 0 or i+1 == batch_count:
elapsed_time = time.time() - start_time
start_time = time.time()
print("Worker %d: epoch %d, batch %d/%d, last interval took %.2fs" % (FLAGS.task_index, epoch, i+1, batch_count, elapsed_time))
count = 0
#sv.stop()
def main(argv=None):
# cluster specification
parameter_servers = ["localhost:2222"]
workers = [ "localhost:2223",
"localhost:2224",
"localhost:2225"]
# config
batch_size = 100
learning_rate = 0.001
training_epochs = 3
logs_path = "/rscratch/cs194/psharing-neural-nets/sync-logging"
#create variables for model
def fetches(learning_rate, global_step):
# input images
with tf.name_scope('input'):
# None -> batch size can be any size, 784 -> flattened mnist image
x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
# target 10 output classes
y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
# model parameters will change during training so we use tf.Variable
tf.set_random_seed(1)
with tf.name_scope("weights"):
W1 = tf.Variable(tf.random_normal([784, 100]))
W2 = tf.Variable(tf.random_normal([100, 10]))
# bias
with tf.name_scope("biases"):
b1 = tf.Variable(tf.zeros([100]))
b2 = tf.Variable(tf.zeros([10]))
# implement model
with tf.name_scope("softmax"):
# y is our prediction
z2 = tf.add(tf.matmul(x,W1),b1)
a2 = tf.nn.sigmoid(z2)
z3 = tf.add(tf.matmul(a2,W2),b2)
y = tf.nn.softmax(z3)
# specify cost function
with tf.name_scope('cross_entropy'):
# this is our cost
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
# specify optimizer
with tf.name_scope('train'):
# optimizer is an "operation" which we can execute in a session
grad_op = tf.train.GradientDescentOptimizer(learning_rate)
print('task %d: len(workers) = %d' % (FLAGS.task_index, len(workers)))
rep_op = tf.train.SyncReplicasOptimizer(grad_op,
replicas_to_aggregate=len(workers),
replica_id=FLAGS.task_index,
total_num_replicas=len(workers),
use_locking=True
)
grads = rep_op.compute_gradients(cross_entropy)
apply_gradients_op = rep_op.apply_gradients(grads,global_step=global_step)
with tf.control_dependencies([apply_gradients_op]):
train_op=tf.identity(cross_entropy,name='train_op')
#train_op = rep_op.minimize(cross_entropy, global_step=global_step)
#train_op = grad_op.minimize(cross_entropy, global_step=global_step)
init_token_op = rep_op.get_init_tokens_op()
chief_queue_runner = rep_op.get_chief_queue_runner()
with tf.name_scope('Accuracy'):
# accuracy
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# create a summary for our cost and accuracy
tf.scalar_summary("cost", cross_entropy)
tf.scalar_summary("accuracy", accuracy)
# merge all summaries into a single "operation" which we can execute in a session
summary_op = tf.merge_all_summaries()
return [x,y_],[train_op, cross_entropy, summary_op, global_step, accuracy, init_token_op, chief_queue_runner]
# load mnist data set
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
dataset = mnist.train
test_dataset = mnist.test
fetches_format = {'train':0,'cost':1,'summary':2,'step':3}
test_fetches_format = {'accuracy':0}
sgd=SynchSGD(parameter_servers,workers)
sgd.run(fetches,fetches_format,dataset,batch_size=batch_size,learning_rate=learning_rate,test_dataset=test_dataset,training_epochs=training_epochs, logs_path=logs_path)
if __name__=="__main__":
app.run()
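One change I've been considering, just to turn the silent hang into an error I can see (my own idea, untested, not something from the Inception example): give each training sess.run a deadline via tf.RunOptions, so whichever worker is stuck raises DeadlineExceededError instead of blocking forever. The inner run call above would become:

run_options = tf.RunOptions(timeout_in_ms=60000)  # give up on a step after 60 s

result = sess.run(
    fetches[:-2],
    feed_dict={inputs[0]: batch_x, inputs[1]: batch_y},
    options=run_options)
# if a worker is stuck waiting on the SyncReplicasOptimizer queues, this run
# should now fail with tf.errors.DeadlineExceededError on that worker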
Other Bugs
For some reason it also seems to be creating a bunch of extra processes (possibly for communication between the tasks), which produce the following memory allocations on 8 K40s (the python2 processes):
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 26489 C ./train-imagenet-sgd 8598MiB |
| 0 28733 C python2 389MiB |
| 0 28735 C python2 123MiB |
| 0 28737 C python2 106MiB |
| 0 28739 C python2 2976MiB |
| 1 26489 C ./train-imagenet-sgd 8598MiB |
| 1 28733 C python2 206MiB |
| 1 28735 C python2 306MiB |
| 1 28737 C python2 106MiB |
| 1 28739 C python2 2976MiB |
| 2 26489 C ./train-imagenet-sgd 8598MiB |
| 2 28733 C python2 206MiB |
| 2 28735 C python2 306MiB |
| 2 28737 C python2 106MiB |
| 2 28739 C python2 2974MiB |
| 3 26489 C ./train-imagenet-sgd 8598MiB |
| 3 28733 C python2 206MiB |
| 3 28735 C python2 306MiB |
| 3 28737 C python2 106MiB |
| 3 28739 C python2 2974MiB |
| 4 26489 C ./train-imagenet-sgd 8598MiB |
| 4 28733 C python2 206MiB |
| 4 28735 C python2 286MiB |
| 4 28737 C python2 106MiB |
| 4 28739 C python2 2976MiB |
| 5 26489 C ./train-imagenet-sgd 8598MiB |
| 5 28733 C python2 2976MiB |
| 5 28735 C python2 306MiB |
| 5 28737 C python2 106MiB |
| 5 28739 C python2 206MiB |
| 6 26489 C ./train-imagenet-sgd 8704MiB |
| 6 28733 C python2 206MiB |
| 6 28735 C python2 106MiB |
| 6 28737 C python2 106MiB |
| 6 28739 C python2 106MiB |
| 7 26489 C ./train-imagenet-sgd 8598MiB |
| 7 28733 C python2 107MiB |
| 7 28735 C python2 106MiB |
| 7 28737 C python2 106MiB |
| 7 28739 C python2 106MiB |
+-----------------------------------------------------------------------------+
If the other job (train-imagenet-sgd) weren't already occupying the GPUs, it would have tried to allocate far more memory (and still failed), producing this giant stream of failed allocations:
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
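In case the GPU memory pressure is part of the problem, here is a sketch of how I think I could cap each task's GPU footprint (my assumption, untested): build one tf.ConfigProto and pass it both to the in-process server and to the session the Supervisor creates.

# limit what each python2 task grabs so the four tasks can coexist with the other job on the K40s
gpu_config = tf.ConfigProto(
    allow_soft_placement=True,
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.1,
                              allow_growth=True))

# in SynchSGD.__init__:
self.server = tf.train.Server(self.cluster,
                              job_name=FLAGS.job_name,
                              task_index=FLAGS.task_index,
                              config=gpu_config)

# in SynchSGD.run:
with sv.prepare_or_wait_for_session(self.server.target,
                                    config=gpu_config) as sess:
    pass  # training loop exactly as above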
I've been trying for 4 days to get this to work and it's due tomorrow, so any help is appreciated.