I am trying to implement Convolutional Neural Network from scratch with Python numpy. I implemented forward and backward phases with numpy einsum (functions conv_forward and conv_backward).
When I compared the results of einsum conv_forward and conv_backward to reference implementations taken from the Coursera's Convolution Neural Network course (conv_forward_ref, conv_backward_ref), it is shown that the einsum versions give slight different results as compared to the reference implementation.
It is neglectable for a small amount of loops, but the difference is significant with a larger number of loops. I was checking my implementation carefully and found no errors. I am not sure why is that, and which implementation is giving correct results. And is there any other ways to implement the functions more efficiently (without using numpy einsum)?
Here is the code:
import numpy as np
# pad data
def pad_data(img_array, pad_size, pad_val=0):
padded_array = np.pad(img_array, ((0, 0), (pad_size, pad_size), (pad_size, pad_size), (0, 0)), 'constant', constant_values=(pad_val, pad_val))
return padded_array
"""
Reference implementation: Coursera's Convolution Neural Network course
"""
# Implement a single matrix multiplication of a slice of input and weights, bias
def conv_single_step(a_slice_prev, W, b):
s = a_slice_prev * W
Z = np.sum(s)
Z = Z + b
return Z
# conv forward: source code from Coursera's Convolution Neural Network course
def conv_forward_ref(A_prev, W, b, hparameters):
# get dimension of output of previous layer
(m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
# get dimension of this layer's filter
(f, f, n_C_prev, n_C) = W.shape
# get values of this layer's hyperparameters
stride = hparameters["stride"]
pad = hparameters["pad"]
# compute the dimensions of the CONV output volume
n_H = int((n_H_prev - f + 2*pad) / stride) + 1
n_W = int((n_W_prev - f + 2*pad) / stride) + 1
# initialize the output volume Z with zeros
Z = np.zeros((m, n_H, n_W, n_C))
# pad the output of previous layer
A_prev_pad = pad_data(A_prev, pad)
# compute Z
for i in range(m):
a_prev_pad = A_prev_pad[i]
for h in range(n_H):
for w in range(n_W):
for c in range(n_C):
# find the corners of the current slice
vert_start = h * stride
vert_end = vert_start + f
horiz_start = w * stride
horiz_end = horiz_start + f
# get the pixel values of the current slice of the previous layer's output
a_slice_prev = a_prev_pad[vert_start:vert_end,horiz_start:horiz_end,:]
# convolve
Z[i,h,w,c] = conv_single_step(a_slice_prev, W[:,:,:,c], b[:,:,:,c])
# make sure the output shape is correct
assert(Z.shape == (m, n_H, n_W, n_C))
return Z
# conv backward: source code from Coursera's Convolution Neural Network course
def conv_backward_ref(dZ, A_prev, W, b, hparameters):
### START CODE HERE ###
# Retrieve dimensions from A_prev's shape
(m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
# Retrieve dimensions from W's shape
(f, f, n_C_prev, n_C) = W.shape
# Retrieve information from "hparameters"
stride = hparameters["stride"]
pad = hparameters["pad"]
# Retrieve dimensions from dZ's shape
(m, n_H, n_W, n_C) = dZ.shape
# Initialize dA_prev, dW, db with the correct shapes
dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))
dW = np.zeros((f, f, n_C_prev, n_C))
db = np.zeros((1, 1, 1, n_C))
# Pad A_prev and dA_prev
A_prev_pad = pad_data(A_prev, pad_size=pad)
dA_prev_pad = pad_data(dA_prev, pad_size=pad)
for i in range(m): # loop over the training examples
# select ith training example from A_prev_pad and dA_prev_pad
a_prev_pad = A_prev_pad[i]
da_prev_pad = dA_prev_pad[i]
for h in range(n_H): # loop over vertical axis of the output volume
for w in range(n_W): # loop over horizontal axis of the output volume
for c in range(n_C): # loop over the channels of the output volume
# Find the corners of the current "slice"
vert_start = h * stride
vert_end = vert_start + f
horiz_start = w * stride
horiz_end = horiz_start + f
# Use the corners to define the slice from a_prev_pad
a_slice = a_prev_pad[vert_start:vert_end,horiz_start:horiz_end,:]
# Update gradients for the window and the filter's parameters using the code formulas given above
da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
db[:,:,:,c] += dZ[i, h, w, c]
# Set the ith training example's dA_prev to the unpaded da_prev_pad (Hint: use X[pad:-pad, pad:-pad, :])
#print(da_prev_pad[pad:-pad, pad:-pad, :].shape)
dA_prev[i, :, :, :] = da_prev_pad[pad:-pad, pad:-pad, :]
### END CODE HERE ###
# Making sure your output shape is correct
assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))
return dA_prev, dW, db
"""
Numpy einsum implementation
"""
# conv forward: implemented with numpy einsum
def conv_forward(A_prev, W, b, hparameters):
# get dimension of output of previous layer
#print(A_prev.shape)
(m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
# get dimension of this layer's filter
(f, f, n_C_prev_W, n_C) = W.shape
# make sure number of channels of A_prev equal to number of channels of W
assert(n_C_prev == n_C_prev_W)
# get values of this layer's hyperparameters and determine shape of output
stride = hparameters["stride"]
pad = hparameters["pad"]
n_H = int((n_H_prev - f + 2*pad) / stride) + 1
n_W = int((n_W_prev - f + 2*pad) / stride) + 1
# pad the output of previous layer
A_prev_pad = pad_data(A_prev, pad)
# compute Z for multiple input images and multiple filters
shape = (f, f, n_C_prev, m, n_H, n_W, 1)
strides = (A_prev_pad.strides * 2)[1:]
M = np.lib.stride_tricks.as_strided(A_prev_pad, shape=shape, strides=strides, writeable=False)
Z = np.einsum('pqrs,pqrtbmn->tbms', W, M)
Z = Z + b
assert(Z.shape == (m, n_H, n_W, n_C))
return Z
# # conv backward: implemented with numpy einsum
def conv_backward(dZ, A_prev, W, b, hparameters):
# retrieve dimensions from A_prev's shape
(m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
# Retrieve dimensions from W's shape
(f, f, n_C_prev, n_C) = W.shape
# retrieve information from "hparameters"
stride = hparameters["stride"]
pad = hparameters["pad"]
# retrieve dimensions from dZ's shape
(m, n_H, n_W, n_C) = dZ.shape
# compute pad values to be applied to dZ, to guarantee A_prev's dimensions
pad_H = int(((n_H_prev - 1) * stride - n_H + f) / 2)
pad_W = int(((n_W_prev - 1) * stride - n_W + f) / 2)
# make sure pad_H equal pad_W cause this implementation support equal padding only
assert(pad_H == pad_W)
pad_dZ = pad_H
## compute dA_prev: inverse of forward process
# step 1: rotate W 180 degrees
# step 1: pad dZ then as_strided dZ to fxfxn_C
# step 2: dot strided dZ and 180-rotated W
# rotate W 180 degrees (= rotate 90 degrees twice) around the two first dims, anti-clockwise direction
W = np.rot90(W, 2)
# pad dZ
dZ_pad = pad_data(dZ, pad_dZ)
# compute dA_prev with strided trick and numpy einsum
shape = (f, f, n_C, m, n_H_prev, n_W_prev)
strides = (dZ_pad.strides)[1:] + (dZ_pad.strides)[0:3]
M = np.lib.stride_tricks.as_strided(dZ_pad, shape=shape, strides=strides, writeable=False)
dA_prev = np.einsum('pqrs,pqsbmn->bmnr', W, M)
assert(dA_prev.shape == A_prev.shape)
# free memory
del dZ_pad
## compute dW and db
# compute dW
A_prev_pad = pad_data(A_prev, pad)
shape_Z = (f, f, n_C_prev, m, n_H, n_W)
strides_Z = (A_prev_pad.strides)[1:] + (A_prev_pad.strides)[0:3]
M = np.lib.stride_tricks.as_strided(A_prev_pad, shape=shape_Z, strides=strides_Z, writeable=False)
dW = np.einsum('abcd,pqsabc->pqsd', dZ, M)
assert(dW.shape == W.shape)
db = np.einsum('abcd->d', dZ).reshape(1, 1, 1, n_C)
return dA_prev, dW, db
## compute dW and db
"""
Test
"""
A_prev = np.random.rand(10, 100, 100, 3) * 1000
W = np.random.rand(5, 5, 3, 10)
b = np.zeros((1, 1, 1, 10))
hparameters = {"stride": 1, "pad": 2}
Z_ref = conv_forward_ref(A_prev, W, b, hparameters)
Z = conv_forward(A_prev, W, b, hparameters)
print("sum of difference for Z: ", np.sum(Z_ref - Z))
print("is Z matched with Z_slow: ", np.allclose(Z_ref, Z))
dZ = np.random.rand(10, 100, 100, 10) * 1000
dA_prev_ref, dW_ref, db_ref = conv_backward_ref(dZ, A_prev, W, b, hparameters)
dA_prev, dW, db = conv_backward(dZ, A_prev, W, b, hparameters)
print("sum of difference for dA: ", np.sum(dA_prev_ref - dA_prev))
print("sum of difference for dW: ", np.sum(dW_ref - dW))
print("sum of difference for db: ", np.sum(db_ref - db))
print(np.allclose(dA_prev_ref, dA_prev))
print(np.allclose(dW_ref, dW))
print(np.allclose(db_ref, db))
Results:
sum of difference for Z: -4.743924364447594e-08
is Z matched with Z_ref: True
sum of difference for dA: 3.2011885195970535e-06
sum of difference for dW: 0.0
sum of difference for db: 0.0
is dA_prev matched with dA_prev_ref: True
is dW matched with dW_ref: True
is db matched with db_ref: True