
I am doing custom training of a neural network in Colab, with and without a GPU, and the training is faster on the CPU, which makes me think that I am not parallelising the operations or that I am missing something. I do not think it is because the model is small; I tried more complicated models and the problem persists:

## Import libraries
import os

# Switch off unnecessary TF warning messages (must be set before TF is imported)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import matplotlib
# matplotlib.use('TkAgg') # Required to make it run on both Windows and Mac
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import numpy as np
from tqdm import trange

###############################################################################
################################## Parameters #################################
###############################################################################

gamma = tf.constant(2.0)           # Curvature of the utility function
rho   = tf.constant(0.04)          # Discount rate
A     = tf.constant(0.5)           # TFP
alpha = tf.constant(0.36)          # Returns to scale
delta = tf.constant(0.05)          # Depreciation rate of capital

batchSize = 100                    # Batch Size
number_epochs = 100000             # Number of epochs

kMin = 0.1                         # lower bound of sample interval
kMax = 10.0                        # upper bound of sample interval

gridSize = 10000                  # Plotting grid

# Set global seed
tf.random.set_seed(1234)
np.random.seed(1234)

# Value function initial guess
initGuess = -60

# Neural network optimizer
optimizer = keras.optimizers.Adam()

###############################################################################
######################## Value Function Neural Network ########################
###############################################################################

def valueFnNeuralNet(nHidden = 3, nNeurons = 8):
    model = keras.models.Sequential()

    # Input layer
    model.add(keras.layers.Dense(nNeurons, activation = "tanh", input_dim = 1))

    # Hidden layers
    for layer in range(nHidden - 1):
        model.add(keras.layers.Dense(nNeurons, activation = "tanh"))

    # Output layer
    model.add(keras.layers.Dense(1, bias_initializer = keras.initializers.Constant(value = initGuess)))
    return model

def HJB(input, V):
    # tf.gradients is only supported in graph mode, which is why training_step
    # below carries the @tf.function decorator
    VPrime = tf.gradients(V(input), input)[0]
    VPrimemax = tf.maximum(VPrime, 1E-7)        # dV/dk

    Y = A * tf.pow(input, alpha)                # Output

    C = tf.pow(VPrimemax, (-1/gamma))           # Consumption

    I = Y - C                                   # Investment

    muK = I - delta * input                     # Capital drift

    U = tf.pow(C, 1-gamma) / (1-gamma)          # Utility

    HJB = U - rho * V(input) + tf.multiply(tf.stop_gradient(VPrimemax), muK)
    return HJB 

def Objective(batchSize):
    input = tf.random.uniform(shape = (batchSize,1), minval = kMin, maxval = kMax)
    error = HJB(input, VF)
    return tf.reduce_mean(tf.square(error))

###############################################################################
################################ Training Step ################################
###############################################################################

# Need the decorator to run in graph mode instead of eager execution
@tf.function
def training_step():
    with tf.GradientTape() as tape:
        loss = Objective(batchSize)
    grads = tape.gradient(loss, theta)
    optimizer.apply_gradients(zip(grads, theta))
    return loss

###############################################################################
################################ Training Loop ################################
###############################################################################

def train_model(epochs):
    losses = []
    for epoch in trange(epochs):
        loss = training_step()
        losses.append(loss.numpy())
    return losses 

###############################################################################
################################### Running ###################################
###############################################################################

# Set up neural network
VF = valueFnNeuralNet()

# Define trainable network parameters
theta = VF.trainable_variables

# Run Model (and output loss evolution) 
results = train_model(number_epochs)

The outputs that I get are the following:

without GPU: 100%|██████████| 100000/100000 [01:30<00:00, 1101.79it/s]

with GPU: 100%|██████████| 100000/100000 [03:36<00:00, 461.47it/s]

1 Answer


GPUs are more efficient for large matrix multiplications. Your input has shape (100, 1), so the parallelism advantage of the GPU is so small that it doesn't even offset the overhead of moving data between the CPU and the GPU.
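
One way to confirm where the ops actually execute is TensorFlow's device placement logging; a minimal diagnostic sketch (this snippet is an addition for illustration, not part of the question's code):

import tensorflow as tf

# Print the device each op runs on; look for ":GPU:0" vs ":CPU:0" in the log
tf.debugging.set_log_device_placement(True)

# An empty list here means TensorFlow does not see a GPU at all
print(tf.config.list_physical_devices('GPU'))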

My guess is that you'll see the pattern reverse if you have input of shape (100, 100) instead.
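
A rough way to test that guess is to time the same matmul on each device at different sizes (the shapes and the time_matmul helper below are illustrative assumptions, not taken from the question):

import time
import tensorflow as tf

def time_matmul(shape, device, reps=100):
    with tf.device(device):
        x = tf.random.uniform(shape)
        tf.matmul(x, x, transpose_b=True).numpy()    # warm-up
        start = time.perf_counter()
        for _ in range(reps):
            y = tf.matmul(x, x, transpose_b=True)
        y.numpy()                                    # wait for pending GPU work
        return (time.perf_counter() - start) / reps

for shape in [(100, 1), (100, 100), (2000, 2000)]:
    for device in ['/CPU:0', '/GPU:0']:
        print(shape, device, time_matmul(shape, device))

Note that with soft device placement (the TF2 default) the '/GPU:0' runs silently fall back to the CPU when no GPU is visible, so compare the two devices only on a GPU runtime. You should see the CPU win or tie at (100, 1) and the GPU pull ahead as the matrices grow.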

Nicolas Gervais