1

I have been searching the web up and down but can't seem to find a simple answer.

Basically, I have a desktop with one GPU, and a laptop where my main code is at. My goal is to use distributed tensorflow to execute python code on my laptop while using GPU over IP.

Here's what I've tried so far:

# for the desktop with one GPU
import tensorflow as tf
cluster = tf.train.ClusterSpec(["worker": ["192.168.1.11:2222"]])
server = tf.distribute.Server(cluster, job_name="worker", task_index=0)
server.start()
server.join()

I have checked that 192.168.1.11:2222 is listening but the problem is my laptop won't connect.

# for laptop connecting to the desktop
import os
import json

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "worker": ["192.168.1.11:2222"],
    },
  "task": {"type": "worker", "index": 0}
})


import tensorflow as tf
import numpy as np

strategy = tf.distribute.OneDeviceStrategy(device="/job:worker/task:0/device:GPU:0")
with strategy.scope():


    text = open('shakespeare.txt', 'rb').read().decode(encoding='utf-8')
    print ('Length of text: {} characters'.format(len(text)))
    vocab = sorted(set(text))
    print ('{} unique characters'.format(len(vocab)))

    char2idx = {u:i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)

    text_as_int = np.array([char2idx[c] for c in text])
    tensored_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

    maxValue = 28
    seq_length = 100
    batch_size = 64

    slicedSequences = tensored_dataset.batch(seq_length, drop_remainder=True)

    def split_input_target(chunk):
        input_text = chunk[:-1]
        target_text = chunk[1:]
        return input_text, target_text
    slicedInputOutput = slicedSequences.map(split_input_target)
    dataset = slicedInputOutput.batch(batch_size)

    vocab_size = len(vocab)
    embedding_dim = 256
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),)
    model.add(tf.keras.layers.LSTM(units=512, return_sequences=True))
    # model.add(tf.keras.layers.LSTM(units=1, return_sequences=True, input_shape=(seq_length, 1)))
    model.add(tf.keras.layers.Dense(units=len(vocab), activation='sigmoid'))

    model.summary()

    def loss(labels, logits):
      return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
    model.compile(optimizer='adam', loss=loss)

    # model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(dataset, epochs=1)

    # def getPrediction(inputSentence):
    #     inputSentence = inputSentence[:seq_length]
    #     result = model.predict(np.array([dictionary[i] for i in inputSentence.lower()]).reshape(1, -1, 1).astype(np.float32)).reshape(-1)
    #     return ''.join([fromChar(dictionary, int(round(perChar*maxValue))) for perChar in result])
    # print(getPrediction('Hello world'))

    inputTest, outputTest = list(dataset.take(1))[0]
    def test():
        example_batch_predictions = model(inputTest)
        sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
        sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
        print("Input: \n", repr("".join(idx2char[inputTest[0]])))
        print()
        print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

Problem is when I define os.environ['TF_CONFIG'], this error occurs:

RuntimeError: /job:worker/replica:0/task:0/device:GPU:0 unknown device.

Any help/suggestions will be appreciated greatly.

Binary
  • 451
  • 5
  • 15

0 Answers0