
I recently started learning deep learning and tried to write forward and backward propagation from scratch, but I think there is a problem with my code. I know this kind of bug is hard to find, and you may find it silly to code everything from scratch instead of using a framework, but please help me if you can. I tried this code on the MNIST data set and the softmax output on the test set was 0.1 for every one of the 10 possible classes.

code for the functions that initialize W and b:

import numpy as np
from scipy.special import expit

def p_init(dim1, dim2):
    # small random weights, zero biases for a layer with dim1 inputs and dim2 units
    W = np.random.randn(dim2, dim1) * 0.01
    b = np.zeros((dim2, 1))
    return W, b

def mp_init(L_dim):
    # initialize every layer of the network described by L_dim
    Lnum = len(L_dim) - 1
    W = {}
    b = {}
    for i in range(Lnum):
        W[str(i+1)], b[str(i+1)] = p_init(L_dim[i], L_dim[i+1])
    return W, b
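
As a quick sanity check of the initialization (just a sketch, using the layer sizes I use further down), the shapes come out like this:

W, b = mp_init([784, 500, 125, 10])
for l in ['1', '2', '3']:
    print(l, W[l].shape, b[l].shape)
# 1 (500, 784) (500, 1)
# 2 (125, 500) (125, 1)
# 3 (10, 125) (10, 1)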

code for relu and its derivative:

def relu(z):
    return np.maximum(0,z)

def drelu(z):
    return np.greater(z, 0).astype(int)

code for sigmoid and its derivative:

def sigmoid(z):
    return expit(z)

def dsigmoid(z):
    s=sigmoid(z)
    return s*(1-s)

code for final layer with softmax:

def softmax(z):
    exps=np.exp(z - np.max(z))
    return exps/np.sum(exps)

code for forward propagation:

def forward(W,b,X,L_num,activation):
    Z={}
    Z['0']=np.array([1]) 
    #print("z0")
    A={}
    A['0']=X
    #print("a0")
    for i in range (L_num):
        Z[str(i+1)]=np.dot(W[str(i+1)],A[str(i)])+b[str(i+1)]
        print('Z['+str(i+1)+']=np.dot(W['+str(i+1)+'],A['+str(i)+'])+b['+str(i+1)+']')
        if (activation[i]==0):
            A[str(i+1)]=sigmoid(Z[str(i+1)])
            print('A['+str(i+1)+']=sigmoid(Z['+str(i+1)+'])')
        elif (activation[i]==1):
            A[str(i+1)]=relu(Z[str(i+1)])
            print('A['+str(i+1)+']=relu(Z['+str(i+1)+'])')
        elif (activation[i]==2):
            A[str(i+1)]=softmax(Z[str(i+1)])
            print('A['+str(i+1)+']=softmax(Z['+str(i+1)+'])')
            
    return Z,A
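
Just to make the shapes concrete, this is a small sketch of how I run the forward pass on a random mini-batch (using the W and b from the shape check above), and the shapes I get at each layer:

Xb = np.random.randn(784, 5)            # 5 fake examples, one per column
Z, A = forward(W, b, Xb, 3, [0, 1, 2])
for l in ['1', '2', '3']:
    print(l, Z[l].shape, A[l].shape)
# 1 (500, 5) (500, 5)
# 2 (125, 5) (125, 5)
# 3 (10, 5) (10, 5)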

code for backward propagation (this is where I most suspect the error):

def backward(A,Y,Z,L_num,activation,m):
    dW={}
    db={}
    dZ={}
    
    dZ[str(L_num)]=A[str(L_num)]-Y
    
    print('dZ['+str(L_num)+']=np.subtract(A['+str(L_num)+'],Y)')
    for i in range(1,L_num+1):
        dW[str(L_num-i+1)]=np.dot(dZ[str(L_num-i+1)],A[str(L_num-i)].T)/m
        print('dW['+str(L_num-i+1)+']=np.dot(dZ['+str(L_num-i+1)+'],A['+str(L_num-i)+'].T)/m')
        db[str(L_num-i+1)]=np.sum( dZ[str(L_num-i+1)],axis=1,keepdims=True)/m
        print('db['+str(L_num-i+1)+']=np.sum( dZ['+str(L_num-i+1)+'],axis=1,keepdims=True)/m')
        if (activation[L_num-(i+1)]==0):
            #print(i)
            dZ[str(L_num-i)]=np.dot(W[str(L_num-i+1)].T,dZ[str(L_num-i+1)])*dsigmoid(Z[str(L_num-i)])
            print('dZ['+str(L_num-i)+']=np.dot(W['+str(L_num-i+1)+'].T,dZ['+str(L_num-i+1)+'])*dsigmoid(Z['+str(L_num-i)+'])')
        elif (activation[L_num-(i+1)]==1):
            #print(i)
            dZ[str(L_num-i)]=np.dot(W[str(L_num-i+1)].T,dZ[str(L_num-i+1)])*drelu(Z[str(L_num-i)])
            print('dZ['+str(L_num-i)+']=np.dot(W['+str(L_num-i+1)+'].T,dZ['+str(L_num-i+1)+'])*drelu(Z['+str(L_num-i)+'])')       
        #elif(activation[L_num-(i+1)]==2):there is no need to compute derivative of softmax
            #print(i)  
            #dZ[str(L_num-i)]=np.dot(W[str(L_num-i+1)].T,dZ[str(L_num-i+1)])*dsoftmax(Z[str(L_num-i)])
    return dZ,dW,db
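
The commented-out softmax branch is there because, with cross-entropy as the loss, the gradient with respect to the last layer's pre-activation simplifies to dZ = A - Y, so no separate softmax derivative is needed. Here is a small numerical check of that identity (just a sketch; the per-column softmax it uses is the fixed version from my answer below):

# Sketch: numerically verify that d(cross_entropy)/dZ = A - Y for one example.
def softmax_col(z):
    e = np.exp(z - np.max(z, axis=0, keepdims=True))
    return e / np.sum(e, axis=0, keepdims=True)

z = np.random.randn(10, 1)
y = np.zeros((10, 1)); y[3] = 1                  # one-hot label
analytic = softmax_col(z) - y                    # the dZ used in backward()

eps = 1e-6
numeric = np.zeros_like(z)
for k in range(10):
    zp, zm = z.copy(), z.copy()
    zp[k] += eps
    zm[k] -= eps
    numeric[k] = (-np.sum(y * np.log(softmax_col(zp)))
                  + np.sum(y * np.log(softmax_col(zm)))) / (2 * eps)

print(np.abs(analytic - numeric).max())          # should be very small (~1e-9)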

code for the update step:

def update(W,b,dW,db,L_num,alpha):
    for i in range (L_num):
        W[str(i+1)]= W[str(i+1)]-(alpha*dW[str(i+1)])
        print('W['+str(i+1)+']= W['+str(i+1)+']-(alpha*dW['+str(i+1)+'])')
        b[str(i+1)]= b[str(i+1)]-(alpha*db[str(i+1)])
        print('b['+str(i+1)+']= b['+str(i+1)+']-(alpha*db['+str(i+1)+'])')
    return W,b
    

code for the cross-entropy loss:

def cross_entropy(Y,A,m):
    S=Y*np.log(A)
    L=np.sum(S)/(-1*m)
    return L
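
Note: if any entry of A is exactly 0, np.log(A) returns -inf and the cost becomes nan. A slightly safer variant (a sketch; the epsilon value is arbitrary):

def cross_entropy_safe(Y, A, m, eps=1e-12):
    # clip activations away from exactly 0 so np.log never produces -inf
    return -np.sum(Y * np.log(np.clip(A, eps, 1.0))) / m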

code to set the initial parameters and create W and b:

m = 60000          # number of training examples
epoch = 500
alpha = 0.1
L_num = 3
L_dim = np.array([784, 500, 125, 10])
activation = np.array([0, 1, 2])   # 0 = sigmoid, 1 = relu, 2 = softmax
W, b = mp_init(L_dim)

code for my oneHotEncoder:

def adjust_y(Y,final_layer):
    m=Y.shape[0]
    Ya=np.zeros((final_layer,m))
    #print(Ya.shape)
    for i in range(m):
        #print(i)
        Ya[int(Y[i])][i]=1#y[i]-1
    return Ya
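
For example, this is the layout it produces (classes along the rows, examples along the columns):

print(adjust_y(np.array([2, 0, 1]), 3))
# [[0. 1. 0.]
#  [0. 0. 1.]
#  [1. 0. 0.]]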

code to load the X and Y data:

import idx2numpy as inp   # assumption: idx2numpy provides convert_from_file for the IDX files

X1 = inp.convert_from_file('TrainX')
X = np.reshape(X1, (60000, 784)).T
Y1 = inp.convert_from_file('TrainY')
Y1 = np.reshape(Y1, (60000, 1))
Y = adjust_y(Y1, 10)

X2 = inp.convert_from_file('TestX')
X2 = np.reshape(X2, (10000, 784)).T
Y2 = inp.convert_from_file('TestY')
Y2 = np.reshape(Y2, (10000, 1))

code to start training:

for i in range(epoch):
    Z, A = forward(W, b, X, L_num, activation)
    #print("Cost"+str(i)+" : "+str(costf(Y,A[str(L_num)],m)))
    print("CrossEntropy "+str(i)+" : "+str(cross_entropy(Y,A[str(L_num)],m)))
    dZ, dW, db = backward(A, Y, Z, L_num, activation, m)
    W, b = update(W, b, dW, db, L_num, alpha)
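
And this is roughly how I check the result on the test set, which is where I see the uniform 0.1 outputs (a sketch; the accuracy computation is not part of the training code above):

Z_t, A_t = forward(W, b, X2, L_num, activation)
pred = np.argmax(A_t[str(L_num)], axis=0)        # predicted digit for each test image
acc = np.mean(pred == Y2.reshape(-1))
print("test accuracy:", acc)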

1 Answer


I tried to find an answer to this for 2 days, but after a few more hours of testing the code I found the simple thing that causes the problem, so I am posting it here: in the softmax function I wrote, the normalization is not done per example. It should normalize along the vertical dimension (the 10 class scores in each column), not across the whole array:

def softmax(z):
    exps = np.exp(z - np.max(z))
    return exps / np.sum(exps, axis=0)

Note that the only change is the added axis=0, so the exponentials are now normalized over each column (one example) separately.
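
To see the difference, here is a small demonstration of the old behaviour versus the fixed one (a sketch with a random 10x3 batch):

z = np.random.randn(10, 3)                     # 3 examples, 10 class scores each

old = np.exp(z - np.max(z)) / np.sum(np.exp(z - np.max(z)))          # original version
new = np.exp(z - np.max(z)) / np.sum(np.exp(z - np.max(z)), axis=0)  # with axis=0

print(old.sum())          # 1.0  -> the whole batch sums to 1, the columns do not
print(new.sum(axis=0))    # [1. 1. 1.]  -> each example's probabilities sum to 1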