XOR with ReLU activation function

Question

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

input = [[0,0,1],[0,1,1],[1,0,1],[1,1,1]]
output = [0,1,1,0]

N = np.size(input,0) # number of samples

Ni = np.size(input,1) # dimension of the samples of input

No = 1 # dimension of the sample of output

Nh = 10 # number of hidden units

Ws = 1/4*np.random.rand(Nh,Ni+1)
print(Ws)

Wo = 1/4*np.random.rand(No,Nh)
print(Wo)

alpha = 0.05 # Learning rate

t_ = []

loss_ = []

def ReLU(x):
    return np.maximum(0,x)

def sigmoid(x):
    return 1/(1+np.exp(-x))

## train the model ====================================================================
for epoch in range(0,3000):
    loss = 0
    for id_ in range(0,N):
        dWs = 0*Ws
        dWo = 0*Wo
        
        x = np.append(input[id_],1)
        
        Z_1 = np.dot(Ws,x)
        Z_2 = np.dot(Wo,ReLU(Z_1))
        y = sigmoid(Z_2)
        d = output[id_]

        for j in range(0,Nh):
            for i in range(0,No):
                if Z_1[j] >= 0:
                    dWo[i,j] = dWo[i,j] + (y[i]-d)*Z_1[j]
                    #dWo[i,j] = dWo[i,j] + sigmoid(Z_1[j])*(y[i]-d)
                else:
                    dWo[i,j] += 0

        Wo = Wo - alpha*dWo
        
        for k in range(0,Ni+1):
            for j in range(0,Nh):
                for i in range(0,No):
                    if Z_1[j] >= 0:
                        dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*(y[i]-d)
                        #dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*sigmoid(Z_1[j])*(1-sigmoid(Z_1[j]))*(y[i]-d)              
                    else:
                        dWs[j,k] += 0
                        
        Ws = Ws - alpha*dWs
        
        loss = loss + 1/2*np.linalg.norm(y-d)

    if np.mod(epoch,50) == 0:
        print(epoch,"-th epoch trained")
            
        t_ = np.append(t_,epoch)
            
        loss_ = np.append(loss_,loss)
            
        fig = plt.figure(num=0,figsize=[10,5])
        plt.plot(t_,loss_,marker="")
        plt.title('Loss decay')
        plt.xlabel('epoch',FontSize=20)
        plt.ylabel('Loss',FontSize=20)
        plt.show()
            
        ## figure out the function shape the model========================================== 
        xn = np.linspace(0,1,20)
        yn = np.linspace(0,1,20)
        xm, ym = np.meshgrid(xn, yn)
        xx = np.reshape(xm,np.size(xm,0)*np.size(xm,1))
        yy = np.reshape(ym,np.size(xm,0)*np.size(xm,1))
        Z = []
        for id__ in range(0,np.size(xm)):
            x = np.append([xx[id__],yy[id__]],[1,1])
            Z_1 = np.dot(Ws,x)
            y_ = sigmoid(np.dot(Wo,ReLU(Z_1)))
            Z = np.append(Z,y_)
                
        fig = plt.figure(num=1,figsize=[10,5])
        ax = fig.gca(projection='3d')
        surf = ax.plot_surface(xm,ym,np.reshape(Z,(np.size(xm,0),np.size(xm,1))),cmap='coolwarm',linewidth=0,antialiased=False)            
        print("====================================================================")
        plt.show()       
        
## test the trained model ====================================================================
for id_ in range(0,N):
    x = np.append(input[id_],1)
        
    Z_1 = np.dot(Ws,x)
        
    y = sigmoid(np.dot(Wo,ReLU(Z_1)))
    
    print(y)

If I try this with sigmoid function, it works fine but when the ReLU activation function is implemented, the the program doesn't learning anything.

The NN consist of 3 input, hidden, output layers and sigmoid activation fuction is implemented for output function. Hand calculation seems fine but can't find the flaw.

The code below with sigmoid activation function works just fine.

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

input = [[0,0,1],[0,1,1],[1,0,1],[1,1,1]]
output = [0,1,1,0]

N = np.size(input,0) # number of samples

Ni = np.size(input,1) # dimension of the samples of input

No = 1 # dimension of the sample of output

Nh = 5 # number of hidden units

Ws = 1/4*np.random.rand(Nh,Ni+1)
#print(Ws)

Wo = 1/4*np.random.rand(No,Nh)
#print(Wo)

alpha = 0.1 # Learning rate

t_ = []

loss_ = []

def sigmoid(x):
    return 1/(1+np.exp(-x))

## train the model ====================================================================
for epoch in range(0,5000):
    loss = 0
    for id_ in range(0,N):
        dWs = 0*Ws
        dWo = 0*Wo
        
        x = np.append(input[id_],1)
        
        Z_1 = np.dot(Ws,x)
        
        A_1 = sigmoid(Z_1)
        
        Z_2 = np.dot(Wo,A_1)

        y = sigmoid(Z_2)
        
        d = output[id_]

        for j in range(0,Nh):
            for i in range(0,No):
                dWo[i,j] = dWo[i,j] + sigmoid(Z_1[j])*(y[i]-d)
                
        Wo = Wo - alpha*dWo
        
        for k in range(0,Ni+1):
            for j in range(0,Nh):
                for i in range(0,No):
                    dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*sigmoid(Z_1[j])*(1-sigmoid(Z_1[j]))*(y[i]-d) 
        
        Ws = Ws - alpha*dWs
        
        loss = loss + 1/2*np.linalg.norm(y-d)
        
    if np.mod(epoch,50) == 0:
        print(epoch,"-th epoch trained")
            
        t_ = np.append(t_,epoch)
            
        loss_ = np.append(loss_,loss)
            
        fig = plt.figure(num=0,figsize=[10,5])
        plt.plot(t_,loss_,marker="")
        plt.title('Loss decay')
        plt.xlabel('epoch',FontSize=20)
        plt.ylabel('Loss',FontSize=20)
        plt.show()
            
        ## figure out the function shape the model========================================== 
        xn = np.linspace(0,1,20)
        yn = np.linspace(0,1,20)
        xm, ym = np.meshgrid(xn, yn)
        xx = np.reshape(xm,np.size(xm,0)*np.size(xm,1))
        yy = np.reshape(ym,np.size(xm,0)*np.size(xm,1))
        Z = []
        for id__ in range(0,np.size(xm)):
            x = np.append([xx[id__],yy[id__]],[1,1])
            Z_1 = np.dot(Ws,x)
            y_ = sigmoid(np.dot(Wo,sigmoid(Z_1)))
            Z = np.append(Z,y_)
                
        fig = plt.figure(num=1,figsize=[10,5])
        ax = fig.gca(projection='3d')
        surf = ax.plot_surface(xm,ym,np.reshape(Z,(np.size(xm,0),np.size(xm,1))),cmap='coolwarm',linewidth=0,antialiased=False)            
        print("====================================================================")
        plt.show()
        
        
## test the trained model ====================================================================
for id_ in range(0,N):
    x = np.append(input[id_],1)
        
    Z_1 = np.dot(Ws,x)
        
    y = sigmoid(np.dot(Wo,sigmoid(Z_1)))
    
    print(y)

score 0 · Answer 1 · answered Jan 25 '22 at 04:55

I found similar case in Quora. And have tested it in my networks that involves modelling logics to resolve some noisy cost function.

I found that ReLu outputs are usually blasted all over, by the 3rd layer of MLP, the values before the output have accumulated to thousands if not millions. And with that, I prefer sigmoid with MLPs. Don't forget, sigmoid limits output to 1, but ReLu does not.

The intuition behind ReLu is that it filters out unneeded info by means of MAX(0,X) function, before forwarded to the next layer of processing. For the same reason you see it being used in Convolution problems. Note: Normalization Layer is used in these cases so that the output values of the nodes will not blast all over.

But in the case of an MLP, you didn't implement any Norm Layer after ReLu, for that reason, it is difficult to model a simple function such as XOR. In short, without Norm Layer, I don't recommend the use of ReLu, although in some cases, it still can function properly.

XOR with ReLU activation function

1 Answers1