
I am new to MXNet and have been experimenting with model parallelism. I found this helpful post, "simple example of mxnet model parallelism",

and modified the code to use HybridBlock as follows:

import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
from mxnet.gluon import HybridBlock

num_inputs = 2
num_outputs = 1
num_examples = 10000


def real_fn(x):
    return 2 * x[:, 0] - 3.4 * x[:, 1] + 4.2


x = np.random.normal(0, 1, (num_examples, num_inputs))
noise = 0.001 * np.random.normal(0, 1, num_examples)
y = real_fn(x) + noise
y = y.reshape(-1, 1)

hidden_layers = 2
num_gpus = hidden_layers + 1
ctxList = [mx.gpu(i) for i in range(num_gpus)]


class MyDenseBlock(HybridBlock):
    def __init__(self, layer_number, size_input, size_output, **kwargs):
        super(MyDenseBlock, self).__init__(**kwargs)

        self.layer_number = layer_number
        self.size_input = size_input
        self.size_output = size_output

        with self.name_scope():
            # add parameters to the Block's ParameterDict.
            self.weight = self.params.get(
                'weight',
                init=mx.init.Xavier(magnitude=2.24),
                shape=(size_input, size_output),
                grad_req='write')

            self.bias = self.params.get(
                'bias',
                init=mx.init.Constant(0.5),
                shape=(size_output,),
                grad_req='write')

    def hybrid_forward(self, F, x, weight, bias):
        # move the incoming activations to this layer's GPU before the matmul
        x = x.as_in_context(ctxList[self.layer_number])
        with x.context:
            linear = F.broadcast_add(F.dot(x, weight), bias)
            return linear


net = gluon.nn.HybridSequential()
with net.name_scope():
    net.add(MyDenseBlock(0, size_input=2, size_output=2))

    for i in range(hidden_layers - 1):
        net.add(MyDenseBlock(i + 1, size_input=2, size_output=2))

    net.add(MyDenseBlock(hidden_layers, size_input=2, size_output=1))


print("\ninitializing:")
params = net.collect_params()

# place each block's parameters on its own GPU
for i, param in enumerate(params):
    if 'mydenseblock0' in param:
        params[param].initialize(ctx=ctxList[0])

    elif 'mydenseblock1' in param:
        params[param].initialize(ctx=ctxList[1])

    elif 'mydenseblock2' in param:
        params[param].initialize(ctx=ctxList[2])

    print("  ", i, param, "  ", params[param].list_data()[0].context)

#net.hybridize()


def square_loss(yhat, y):
    return nd.mean((yhat - y) ** 2)


def custom_trainer(updaters, params, ignore_stale_grad=False):
    # manually apply the optimizer update to every parameter (mimics gluon.Trainer.step)
    for i, param in enumerate(params):
        if params[param].grad_req == 'null':
            continue
        if not ignore_stale_grad:
            for data in params[param].list_data():
                if not data._fresh_grad:
                    print("`%s` on context %s has not been updated" % (params[param].name, str(data.context)))
                    assert False

        for upd, arr, grad in zip(updaters, params[param].list_data(), params[param].list_grad()):
            if not ignore_stale_grad or arr._fresh_grad:
                upd(i, grad, arr)
                arr._fresh_grad = False


batch_size = 100
epochs = 100
iteration = -1

opt = mx.optimizer.create('adam', learning_rate=0.001, rescale_grad=1 / batch_size)
updaters = [mx.optimizer.get_updater(opt)]


results = []
for e in range(epochs):
    train_groups = np.array_split(np.arange(x.shape[0]), x.shape[0] // batch_size)
    for i, idx in enumerate(train_groups):
        iteration += 1
        xtrain, ytrain = x[idx, :], y[idx]

        xtrain = nd.array(xtrain)
        xtrain = xtrain.as_in_context(ctxList[0])

        ytrain = nd.array(ytrain).reshape((-1, 1))
        ytrain = ytrain.as_in_context(ctxList[0])

        with autograd.record():
            yhat = net(xtrain)
            loss = square_loss(yhat, ytrain.as_in_context(ctxList[-1]))

        loss.backward()
        custom_trainer(updaters, net.collect_params())

        if iteration % 10 == 0:
            results.append([iteration, loss.asnumpy().item()])
            print("epoch= {:5,d}, iter= {:6,d},  error= {:6.3E}".format(e, iteration, loss.asnumpy().item()))

However, I got the following error:

RuntimeError: Parameter 'hybridsequential0_mydenseblock1_weight' was 
not initialized on context gpu(0). It was only initialized on [gpu(1)].
terminate called recursively
terminate called after throwing an instance of 'dmlc::Error'

I had no problem using Block (as already confirmed in the post); it is only HybridBlock that fails. Could someone please help? It seems to me that model parallelism examples are pretty scant.
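
For reference, here is a rough sketch of the working (non-hybrid) Block version of the layer, reconstructed from my code above rather than copied verbatim from the post (it reuses the same imports and ctxList; the class name MyDenseBlockND is just a placeholder):

class MyDenseBlockND(gluon.Block):
    def __init__(self, layer_number, size_input, size_output, **kwargs):
        super(MyDenseBlockND, self).__init__(**kwargs)
        self.layer_number = layer_number
        with self.name_scope():
            self.weight = self.params.get(
                'weight', init=mx.init.Xavier(magnitude=2.24),
                shape=(size_input, size_output), grad_req='write')
            self.bias = self.params.get(
                'bias', init=mx.init.Constant(0.5),
                shape=(size_output,), grad_req='write')

    def forward(self, x):
        # move the activations to this layer's GPU and use the parameter
        # copies that were initialized on that same GPU
        ctx = ctxList[self.layer_number]
        x = x.as_in_context(ctx)
        return nd.broadcast_add(nd.dot(x, self.weight.data(ctx)),
                                self.bias.data(ctx))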

Hugo

1 Answer


Could you please let me know what problem you are trying to solve with model parallelism? With recent GPUs, model parallelism is rarely necessary for most models. Consider whether you can solve the problem with a different approach, such as:

  • Use FP16.
  • Reduce the number of parameters/layers in the model.
  • Reduce the size of the input, e.g. the number of input features.
  • Reduce the batch size and do data parallelism (a minimal sketch follows this list).
  • Decompose the problem into submodels that can be trained separately.
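
For the data-parallelism option, here is a minimal sketch of the usual Gluon pattern (assuming two GPUs and a small Dense network, not the exact model from your question; train_batch is just an illustrative helper):

import mxnet as mx
from mxnet import autograd, gluon

ctx = [mx.gpu(0), mx.gpu(1)]

net = gluon.nn.HybridSequential()
with net.name_scope():
    net.add(gluon.nn.Dense(2, activation='relu'))
    net.add(gluon.nn.Dense(1))
net.initialize(mx.init.Xavier(), ctx=ctx)  # parameters are replicated on every GPU
net.hybridize()

loss_fn = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.001})

def train_batch(x_batch, y_batch):
    # split the batch across the GPUs; the trainer aggregates the gradients
    xs = gluon.utils.split_and_load(x_batch, ctx)
    ys = gluon.utils.split_and_load(y_batch, ctx)
    with autograd.record():
        losses = [loss_fn(net(x), y) for x, y in zip(xs, ys)]
    for l in losses:
        l.backward()
    trainer.step(x_batch.shape[0])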

If none of the above works for you and you are convinced you really need model parallelism, here is sample code in Gluon that does model parallelism: https://github.com/indhub/mxnet_tutorials/blob/master/model_parallelism/ModelParallelism.ipynb
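
The general idea (a rough illustration only, not the notebook's exact code) is to keep each part of the model on its own GPU and copy the activations between GPUs inside the autograd.record() scope, so gradients flow back across the device boundary:

import mxnet as mx
from mxnet import autograd, gluon

ctx0, ctx1 = mx.gpu(0), mx.gpu(1)

# first part of the model lives on GPU 0, second part on GPU 1
part1 = gluon.nn.Dense(2, activation='relu')
part2 = gluon.nn.Dense(1)
part1.initialize(mx.init.Xavier(), ctx=ctx0)
part2.initialize(mx.init.Xavier(), ctx=ctx1)

loss_fn = gluon.loss.L2Loss()
trainer1 = gluon.Trainer(part1.collect_params(), 'adam', {'learning_rate': 0.001})
trainer2 = gluon.Trainer(part2.collect_params(), 'adam', {'learning_rate': 0.001})

def train_batch(x_batch, y_batch):
    x = x_batch.as_in_context(ctx0)
    y = y_batch.as_in_context(ctx1)
    with autograd.record():
        h = part1(x)               # runs on GPU 0
        h = h.as_in_context(ctx1)  # copy activations to GPU 1 (differentiable)
        out = part2(h)             # runs on GPU 1
        loss = loss_fn(out, y)
    loss.backward()
    trainer1.step(x_batch.shape[0])
    trainer2.step(x_batch.shape[0])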

If you don't understand any part of that code, please ask and I can explain.

Indhu Bharathi