I am new to MXNet and have been experimenting with model parallelism. I found a nice post, "simple example of mxnet model parallelism", and modified its code to use HybridBlock as follows:
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
from mxnet.gluon import HybridBlock
num_inputs = 2
num_outputs = 1
num_examples = 10000
def real_fn(x):
    return 2 * x[:, 0] - 3.4 * x[:, 1] + 4.2
x = np.random.normal(0, 1, (num_examples, num_inputs))
noise = 0.001 * np.random.normal(0, 1, num_examples)
y = real_fn(x) + noise
y = y.reshape(-1, 1)
hidden_layers = 2
num_gpus = hidden_layers + 1
ctxList = [mx.gpu(i) for i in range(num_gpus)]
class MyDenseBlock(HybridBlock):
    def __init__(self, layer_number, size_input, size_output, **kwargs):
        super(MyDenseBlock, self).__init__(**kwargs)
        self.layer_number = layer_number
        self.size_input = size_input
        self.size_output = size_output

        with self.name_scope():
            # add parameters to the Block's ParameterDict.
            self.weight = self.params.get(
                'weight',
                init=mx.init.Xavier(magnitude=2.24),
                shape=(size_input, size_output),
                grad_req='write')
            self.bias = self.params.get(
                'bias',
                init=mx.init.Constant(0.5),
                shape=(size_output,),
                grad_req='write')

    def hybrid_forward(self, F, x, weight, bias):
        x = x.as_in_context(ctxList[self.layer_number])
        with x.context:
            linear = F.broadcast_add(F.dot(x, weight), bias)
        return linear
net = gluon.nn.HybridSequential()
with net.name_scope():
    net.add(MyDenseBlock(0, size_input=2, size_output=2))
    for i in range(hidden_layers - 1):
        net.add(MyDenseBlock(i + 1, size_input=2, size_output=2))
    net.add(MyDenseBlock(i + 2, size_input=2, size_output=1))
print("\ninitializing:")
params = net.collect_params()
for i, param in enumerate(params):
if 'mydenseblock0' in param:
params[param].initialize(ctx=ctxList[0])
elif 'mydenseblock1' in param:
params[param].initialize(ctx=ctxList[1])
elif 'mydenseblock2' in param:
params[param].initialize(ctx=ctxList[2])
print(" ", i, param, " ", params[param].list_data()[0].context)
#net.hybridize()
def square_loss(yhat, y):
    return nd.mean((yhat - y) ** 2)
def custom_trainer(updaters, params, ignore_stale_grad=False):
    for i, param in enumerate(params):
        if params[param].grad_req == 'null':
            continue
        if not ignore_stale_grad:
            for data in params[param].list_data():
                if not data._fresh_grad:
                    print("`%s` on context %s has not been updated"
                          % (params[param].name, str(data.context)))
                    assert False
        for upd, arr, grad in zip(updaters, params[param].list_data(), params[param].list_grad()):
            if not ignore_stale_grad or arr._fresh_grad:
                upd(i, grad, arr)
                arr._fresh_grad = False
batch_size = 100
epochs = 100
iteration = -1
opt = mx.optimizer.create('adam', learning_rate=0.001, rescale_grad=1 / batch_size)
updaters = [mx.optimizer.get_updater(opt)]
results = []
for e in range(epochs):
    train_groups = np.array_split(np.arange(x.shape[0]), x.shape[0] / batch_size)
    for i, idx in enumerate(train_groups):
        iteration += 1
        xtrain, ytrain = x[idx, :], y[idx]

        xtrain = nd.array(xtrain)
        xtrain = xtrain.as_in_context(ctxList[0])

        ytrain = nd.array(ytrain).reshape((-1, 1))
        ytrain = ytrain.as_in_context(ctxList[0])

        with autograd.record():
            yhat = net(xtrain)
            loss = square_loss(yhat, ytrain.as_in_context(ctxList[-1]))

        loss.backward()
        custom_trainer(updaters, net.collect_params())

        if iteration % 10 == 0:
            results.append([iteration, loss.asnumpy().item()])
            print("epoch= {:5,d}, iter= {:6,d}, error= {:6.3E}".format(
                e, iteration, loss.asnumpy().item()))
However, I got the following error:
RuntimeError: Parameter 'hybridsequential0_mydenseblock1_weight' was
not initialized on context gpu(0). It was only initialized on [gpu(1)].
terminate called recursively
terminate called after throwing an instance of 'dmlc::Error'
I had no problem using plain Blocks (as already confirmed in the linked post); it is only the HybridBlock version that fails. Could someone please help? Model parallelism examples seem to be pretty scant.
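For comparison, the non-hybrid version I had working looks roughly like this (a minimal sketch, not my exact script: it reuses the same imports, ctxList, initialization loop, and training loop as above, keeps the same class name so the name-based initialization still matches, and only changes the base class and the forward method):

class MyDenseBlock(gluon.Block):
    def __init__(self, layer_number, size_input, size_output, **kwargs):
        super(MyDenseBlock, self).__init__(**kwargs)
        self.layer_number = layer_number
        with self.name_scope():
            self.weight = self.params.get(
                'weight',
                init=mx.init.Xavier(magnitude=2.24),
                shape=(size_input, size_output),
                grad_req='write')
            self.bias = self.params.get(
                'bias',
                init=mx.init.Constant(0.5),
                shape=(size_output,),
                grad_req='write')

    def forward(self, x):
        # copy the activations to this layer's GPU, then read the parameters
        # that live on that same GPU before the matrix multiply
        ctx = ctxList[self.layer_number]
        x = x.as_in_context(ctx)
        return nd.broadcast_add(nd.dot(x, self.weight.data(ctx)),
                                self.bias.data(ctx))

The network is then built with gluon.nn.Sequential() instead of HybridSequential(), and everything else stays the same; that version trains across the three GPUs without the error above.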