I'm trying to use MXNet's gradient descent optimizers to minimize a function. The equivalent example in TensorFlow would be:
import tensorflow as tf

x = tf.Variable(2, name='x', dtype=tf.float32)
log_x = tf.log(x)
log_x_squared = tf.square(log_x)

optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(log_x_squared)

init = tf.initialize_all_variables()

def optimize():
    with tf.Session() as session:
        session.run(init)
        print("starting at", "x:", session.run(x), "log(x)^2:", session.run(log_x_squared))
        for step in range(10):
            session.run(train)
            print("step", step, "x:", session.run(x), "log(x)^2:", session.run(log_x_squared))
I am not sure how to accomplish the same thing in MXNet. The optimizer API documentation does not appear to have an equivalent method. Here's what I've tried so far; the main confusion has been around the need to pass training data:
import mxnet as mx

x = mx.sym.Variable('data')
log_x = mx.sym.log(x)
log_x_squared = mx.sym.square(log_x)

# Create a module whose loss function is the one we want to optimize
mod = mx.mod.Module(log_x_squared)

# ?? not sure if this is correct - we are saying our input is a scalar
mod.bind(data_shapes=[('data', (1, 1))])

mod.init_params()
mod.init_optimizer()  # SGD is the default
mod.fit()             # ?? must pass a data_iter to fit
It seems like the x variable should somehow be fed back in as the data_iter, but I don't know how to accomplish this.
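The closest I could come up with is binding a dummy iterator of constant data, but even if this runs I suspect it is wrong, because the optimizer only updates module parameters and 'data' is an input, not a parameter, so x itself would never move. A minimal, untested sketch of that idea:

import numpy as np

# Sketch only: a dummy iterator of ones so that fit() has something to consume.
# Even if this executes, SGD would update parameters, not the 'data' input,
# so the value of x would stay fixed.
dummy_iter = mx.io.NDArrayIter(data=np.ones((1, 1)), batch_size=1)
# mod.fit(dummy_iter, num_epoch=10)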
Update: thanks to kevinthesun for their excellent answer! Here is a working minimization routine built on top of a neural net consisting of a single fully connected layer:
import mxnet as mx
import numpy as np


def minimize(objective_function,
             initial_params,
             max_iters=1000,
             optimizer='sgd',
             optimizer_params=(('learning_rate', 0.1),),
             tol=1e-8):

    class InitialParam(mx.init.Initializer):

        def __init__(self, vals):
            super(InitialParam, self).__init__()
            self._vals = vals

        def _init_weight(self, _, arr):
            arr[:] = self._vals.asnumpy()[:, np.newaxis]

    x = mx.sym.Variable('data')
    params_len = initial_params.shape[0]
    fc = mx.sym.FullyConnected(data=x, name='fc1',
                               num_hidden=params_len,
                               no_bias=True)

    # The raw FullyConnected output is awkward to pass into the objective
    # function. If it represents [x, y] for a 2-dimensional function f(x, y),
    # it is easier to work with x and y separately, so we slice it into one
    # symbol per parameter:
    param_syms = []
    for i in range(params_len):
        ps = mx.sym.slice(fc, begin=(0, i), end=(1, i + 1))
        param_syms.append(ps)

    # The loss function for the network is our objective function.
    loss = mx.sym.MakeLoss(objective_function(param_syms))

    mod = mx.mod.Module(loss)
    mod.bind(data_shapes=[('data', (1,))])
    mod.init_params(InitialParam(initial_params))
    mod.init_optimizer(optimizer=optimizer,
                       optimizer_params=optimizer_params)

    (o_name, o_shape), = mod.output_shapes

    i = 0
    params = initial_params
    old_val = np.full(o_shape, np.nan)
    while i < max_iters:
        mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((1,))]))
        mod.update()
        params = mod.get_params()[0]['fc1_weight']
        val = mod.get_outputs()[0].asnumpy()
        if np.allclose(old_val, val, atol=tol):
            print('Function value: {}'.format(val))
            print('Iterations: {}'.format(i))
            return params
        old_val = val
        i += 1
    return params
and using it:
def my_func(x):
    return (x[0] + 1) ** 2
p = minimize(my_func, mx.nd.array([1.0]))
p.asnumpy()
>>> array([[-0.99999988]], dtype=float32)
and another:
def my_func(x):
    return (x[0] + 1) ** 2 + (x[1] - 2) ** 2 + (x[2] + 3) ** 2
p = minimize(my_func, mx.nd.array([1.0, 1.5, 2.0]))
p.asnumpy()
>>> array([[-0.99996436],
           [ 1.99999106],
           [-2.99991083]], dtype=float32)
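Since the optimizer name and its hyperparameters are passed straight through to init_optimizer, any optimizer registered with MXNet should drop in without changing the routine. For example, a sketch using Adam (the learning rate here is an arbitrary guess, not tuned):

def my_func(x):
    return (x[0] + 1) ** 2 + (x[1] - 2) ** 2 + (x[2] + 3) ** 2

# Same routine, different optimizer; only the keyword arguments change.
p = minimize(my_func, mx.nd.array([1.0, 1.5, 2.0]),
             optimizer='adam',
             optimizer_params=(('learning_rate', 0.1),))
p.asnumpy()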