CNN forward and backward with numpy einsum give different results to for loop implementation

Question

I am trying to implement Convolutional Neural Network from scratch with Python numpy. I implemented forward and backward phases with numpy einsum (functions conv_forward and conv_backward).

When I compared the results of einsum conv_forward and conv_backward to reference implementations taken from the Coursera's Convolution Neural Network course (conv_forward_ref, conv_backward_ref), it is shown that the einsum versions give slight different results as compared to the reference implementation.

It is neglectable for a small amount of loops, but the difference is significant with a larger number of loops. I was checking my implementation carefully and found no errors. I am not sure why is that, and which implementation is giving correct results. And is there any other ways to implement the functions more efficiently (without using numpy einsum)?

Here is the code:

    import numpy as np

    # pad data
    def pad_data(img_array, pad_size, pad_val=0):
        padded_array = np.pad(img_array, ((0, 0), (pad_size, pad_size), (pad_size, pad_size), (0, 0)), 'constant', constant_values=(pad_val, pad_val))

        return padded_array

    """
    Reference implementation: Coursera's Convolution Neural Network course
    """
    # Implement a single matrix multiplication of a slice of input and weights, bias
    def conv_single_step(a_slice_prev, W, b):
        s = a_slice_prev * W
        Z = np.sum(s)
        Z = Z + b

        return Z

    # conv forward: source code from Coursera's Convolution Neural Network course
    def conv_forward_ref(A_prev, W, b, hparameters):
        # get dimension of output of previous layer
        (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
        # get dimension of this layer's filter
        (f, f, n_C_prev, n_C) = W.shape
        # get values of this layer's hyperparameters
        stride = hparameters["stride"]
        pad = hparameters["pad"]

        # compute the dimensions of the CONV output volume 
        n_H = int((n_H_prev - f + 2*pad) / stride) + 1
        n_W = int((n_W_prev - f + 2*pad) / stride) + 1

        # initialize the output volume Z with zeros
        Z = np.zeros((m, n_H, n_W, n_C))

        # pad the output of previous layer
        A_prev_pad = pad_data(A_prev, pad)

        # compute Z
        for i in range(m):
            a_prev_pad = A_prev_pad[i]
            for h in range(n_H):
                for w in range(n_W):
                    for c in range(n_C):

                        # find the corners of the current slice
                        vert_start = h * stride
                        vert_end = vert_start + f
                        horiz_start = w * stride
                        horiz_end = horiz_start + f

                        # get the pixel values of the current slice of the previous layer's output
                        a_slice_prev = a_prev_pad[vert_start:vert_end,horiz_start:horiz_end,:]

                        # convolve 
                        Z[i,h,w,c] = conv_single_step(a_slice_prev, W[:,:,:,c], b[:,:,:,c])

        # make sure the output shape is correct
        assert(Z.shape == (m, n_H, n_W, n_C))

        return Z

    # conv backward: source code from Coursera's Convolution Neural Network course
    def conv_backward_ref(dZ, A_prev, W, b, hparameters):
        ### START CODE HERE ###

        # Retrieve dimensions from A_prev's shape
        (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

        # Retrieve dimensions from W's shape
        (f, f, n_C_prev, n_C) = W.shape

        # Retrieve information from "hparameters"
        stride = hparameters["stride"]
        pad = hparameters["pad"]

        # Retrieve dimensions from dZ's shape
        (m, n_H, n_W, n_C) = dZ.shape

        # Initialize dA_prev, dW, db with the correct shapes
        dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))                           
        dW = np.zeros((f, f, n_C_prev, n_C))
        db = np.zeros((1, 1, 1, n_C))

        # Pad A_prev and dA_prev
        A_prev_pad = pad_data(A_prev, pad_size=pad)
        dA_prev_pad = pad_data(dA_prev, pad_size=pad)

        for i in range(m):                       # loop over the training examples

            # select ith training example from A_prev_pad and dA_prev_pad
            a_prev_pad = A_prev_pad[i]
            da_prev_pad = dA_prev_pad[i]

            for h in range(n_H):                   # loop over vertical axis of the output volume
                for w in range(n_W):               # loop over horizontal axis of the output volume
                    for c in range(n_C):           # loop over the channels of the output volume

                        # Find the corners of the current "slice"
                        vert_start = h * stride
                        vert_end = vert_start + f
                        horiz_start = w * stride
                        horiz_end = horiz_start + f

                        # Use the corners to define the slice from a_prev_pad
                        a_slice = a_prev_pad[vert_start:vert_end,horiz_start:horiz_end,:]

                        # Update gradients for the window and the filter's parameters using the code formulas given above
                        da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
                        dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
                        db[:,:,:,c] += dZ[i, h, w, c]

            # Set the ith training example's dA_prev to the unpaded da_prev_pad (Hint: use X[pad:-pad, pad:-pad, :])
            #print(da_prev_pad[pad:-pad, pad:-pad, :].shape)
            dA_prev[i, :, :, :] = da_prev_pad[pad:-pad, pad:-pad, :]
            ### END CODE HERE ###

        # Making sure your output shape is correct
        assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))

        return dA_prev, dW, db

    """
    Numpy einsum implementation
    """
    # conv forward: implemented with numpy einsum
    def conv_forward(A_prev, W, b, hparameters):
        # get dimension of output of previous layer
        #print(A_prev.shape)
        (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
        # get dimension of this layer's filter
        (f, f, n_C_prev_W, n_C) = W.shape

        # make sure number of channels of A_prev equal to number of channels of W
        assert(n_C_prev == n_C_prev_W)

        # get values of this layer's hyperparameters and determine shape of output
        stride = hparameters["stride"]
        pad = hparameters["pad"]

        n_H = int((n_H_prev - f + 2*pad) / stride) + 1
        n_W = int((n_W_prev - f + 2*pad) / stride) + 1

        # pad the output of previous layer
        A_prev_pad = pad_data(A_prev, pad)

        # compute Z for multiple input images and multiple filters
        shape = (f, f, n_C_prev, m, n_H, n_W, 1)
        strides = (A_prev_pad.strides * 2)[1:]

        M = np.lib.stride_tricks.as_strided(A_prev_pad, shape=shape, strides=strides, writeable=False)
        Z = np.einsum('pqrs,pqrtbmn->tbms', W, M)   
        Z = Z + b

        assert(Z.shape == (m, n_H, n_W, n_C))

        return Z

    # # conv backward: implemented with numpy einsum
    def conv_backward(dZ, A_prev, W, b, hparameters):
        # retrieve dimensions from A_prev's shape
        (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

        # Retrieve dimensions from W's shape
        (f, f, n_C_prev, n_C) = W.shape

        # retrieve information from "hparameters"
        stride = hparameters["stride"]
        pad = hparameters["pad"]

        # retrieve dimensions from dZ's shape
        (m, n_H, n_W, n_C) = dZ.shape

        # compute pad values to be applied to dZ, to guarantee A_prev's dimensions
        pad_H = int(((n_H_prev - 1) * stride - n_H + f) / 2)
        pad_W = int(((n_W_prev - 1) * stride - n_W + f) / 2)

        # make sure pad_H equal pad_W cause this implementation support equal padding only
        assert(pad_H == pad_W)
        pad_dZ = pad_H

        ## compute dA_prev: inverse of forward process
        # step 1: rotate W 180 degrees 
        # step 1: pad dZ then as_strided dZ to fxfxn_C 
        # step 2: dot strided dZ and 180-rotated W

        # rotate W 180 degrees (= rotate 90 degrees twice) around the two first dims, anti-clockwise direction
        W = np.rot90(W, 2)

        # pad dZ
        dZ_pad = pad_data(dZ, pad_dZ)

        # compute dA_prev with strided trick and numpy einsum
        shape = (f, f, n_C, m, n_H_prev, n_W_prev)
        strides = (dZ_pad.strides)[1:] +  (dZ_pad.strides)[0:3]

        M = np.lib.stride_tricks.as_strided(dZ_pad, shape=shape, strides=strides, writeable=False)
        dA_prev = np.einsum('pqrs,pqsbmn->bmnr', W, M)
        assert(dA_prev.shape == A_prev.shape)

        # free memory
        del dZ_pad

        ## compute dW and db
        # compute dW
        A_prev_pad = pad_data(A_prev, pad)
        shape_Z = (f, f, n_C_prev, m, n_H, n_W)
        strides_Z = (A_prev_pad.strides)[1:] + (A_prev_pad.strides)[0:3]

        M = np.lib.stride_tricks.as_strided(A_prev_pad, shape=shape_Z, strides=strides_Z, writeable=False)
        dW = np.einsum('abcd,pqsabc->pqsd', dZ, M) 
        assert(dW.shape == W.shape)

        db = np.einsum('abcd->d', dZ).reshape(1, 1, 1, n_C)

        return dA_prev, dW, db
        ## compute dW and db


    """
    Test
    """
    A_prev = np.random.rand(10, 100, 100, 3) * 1000
    W = np.random.rand(5, 5, 3, 10)
    b = np.zeros((1, 1, 1, 10))
    hparameters = {"stride": 1, "pad": 2}

    Z_ref = conv_forward_ref(A_prev, W, b, hparameters)
    Z = conv_forward(A_prev, W, b, hparameters)

    print("sum of difference for Z: ", np.sum(Z_ref - Z))
    print("is Z matched with Z_slow: ", np.allclose(Z_ref, Z))

    dZ = np.random.rand(10, 100, 100, 10) * 1000
    dA_prev_ref, dW_ref, db_ref = conv_backward_ref(dZ, A_prev, W, b, hparameters)
    dA_prev, dW, db = conv_backward(dZ, A_prev, W, b, hparameters)

    print("sum of difference for dA: ", np.sum(dA_prev_ref - dA_prev))
    print("sum of difference for dW: ", np.sum(dW_ref - dW))
    print("sum of difference for db: ", np.sum(db_ref - db))

    print(np.allclose(dA_prev_ref, dA_prev))
    print(np.allclose(dW_ref, dW))
    print(np.allclose(db_ref, db))

Results:

sum of difference for Z:  -4.743924364447594e-08
is Z matched with Z_ref:  True
sum of difference for dA:  3.2011885195970535e-06
sum of difference for dW:  0.0
sum of difference for db:  0.0
is dA_prev matched with dA_prev_ref:  True
is dW matched with dW_ref:  True
is db matched with db_ref:  True

CNN forward and backward with numpy einsum give different results to for loop implementation

0 Answers0