After implementing my own GRU cell, I wanted to validate it against the default implementations available in PyTorch and Keras. My implementation was very close to PyTorch's but significantly different from Keras's, so I decided to compare the two built-in implementations against each other and found that they too differ significantly. Here is some code:
import numpy as np
import torch as tt
import torch.nn as nn
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# data generation
input_size = 2
seq_len = 4
hidden_size = 3
batch_size=1
rng = np.random.default_rng(10)
xx = rng.uniform(size=(batch_size, seq_len, input_size)).astype(np.float32)
print(xx.shape)
output:
(1, 4, 2)
First, create a Keras GRU layer and do one forward pass to get the output:
gru_keras = layers.GRU(hidden_size, return_sequences=True)
out_gru_keras = gru_keras(xx)
out_gru_keras
output:
<tf.Tensor: shape=(1, 4, 3), dtype=float32, numpy=
array([[[0.05563376, 0.22639018, 0.14037813],
[0.05021746, 0.27509487, 0.17843583],
[0.0273784 , 0.23740683, 0.1731183 ],
[0.10505023, 0.38027072, 0.31583947]]], dtype=float32)>
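For reference, the layer above relies on the Keras defaults; spelling them out explicitly (as I understand them, so treat this as an assumption) gives:

gru_keras_explicit = layers.GRU(
    hidden_size,
    activation='tanh',               # candidate-state activation
    recurrent_activation='sigmoid',  # gate activation
    use_bias=True,
    reset_after=True,                # TF2 default; applies the reset gate after the matrix multiplication
    return_sequences=True,
)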
Then check the weights of the Keras GRU:
gru_keras_weights = gru_keras.get_weights()
print(f'{len(gru_keras_weights)=}')
for w in gru_keras_weights:
    print(w.shape, w.dtype, w)
output:
len(gru_keras_weights)=3
(2, 9) float32 [[ 0.38249677 -0.67729133 -0.28855678 0.3081903 -0.530349 0.1531434
0.09444886 0.2978403 0.1516701 ]
[ 0.0833146 0.27516943 0.4720915 -0.7370237 -0.20921749 0.38180763
0.23018956 0.39872426 0.5722596 ]]
(3, 9) float32 [[-0.21449113 0.2944518 -0.25759113 0.2292317 0.35174483 0.42021522
-0.5116475 0.42759803 -0.05884372]
[-0.3477073 -0.15120703 0.7333025 0.418491 -0.07980055 -0.21007833
-0.1924745 0.20504965 0.11737184]
[ 0.01270523 0.15124948 0.4014033 -0.568793 0.4513449 -0.03860948
-0.39513308 -0.36090007 -0.02702253]]
(2, 9) float32 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
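If I understand the Keras layout correctly, each of these three arrays stacks the per-gate blocks side by side along the last axis, so they can be split like this (a sketch based on that assumption):

kernel, recurrent_kernel, bias = gru_keras_weights
for name, w in [('kernel', kernel), ('recurrent_kernel', recurrent_kernel), ('bias', bias)]:
    blocks = np.split(w, 3, axis=-1)   # one block per gate
    print(name, [b.shape for b in blocks])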
Now create a PyTorch GRU and check its weights:
gru_torch = nn.GRU(
    input_size=input_size,
    hidden_size=hidden_size,
    bias=True,
    batch_first=True,
    num_layers=1,
    dropout=0.0,
    bidirectional=False,
    dtype=tt.float32
)
sd_gru_torch = gru_torch.state_dict()
for k, v in sd_gru_torch.items():
    print(f'{k}, {v.shape}')
output:
weight_ih_l0, torch.Size([9, 2])
weight_hh_l0, torch.Size([9, 3])
bias_ih_l0, torch.Size([9])
bias_hh_l0, torch.Size([9])
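According to the PyTorch docs, each of these tensors stacks the three per-gate blocks along dim 0, so the analogous split on the PyTorch side looks like this (a sketch):

for name, param in gru_torch.named_parameters():
    blocks = param.detach().chunk(3, dim=0)   # one (hidden_size, ...) block per gate
    print(name, [tuple(b.shape) for b in blocks])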
Clearly, PyTorch implements two separate biases, bias_ih_l0 and bias_hh_l0, while Keras stacks them into a single (2, 9) array (initialized to zeros here). So I copied the weights accordingly:
with tt.no_grad():
    gru_torch.get_parameter('weight_ih_l0').copy_(tt.tensor(gru_keras_weights[0].T))
    gru_torch.get_parameter('weight_hh_l0').copy_(tt.tensor(gru_keras_weights[1].T))
    gru_torch.get_parameter('bias_ih_l0').copy_(tt.tensor(gru_keras_weights[2].T[:,0]))
    gru_torch.get_parameter('bias_hh_l0').copy_(tt.tensor(gru_keras_weights[2].T[:,1]))
... and check the parameters again
with tt.no_grad():
    for p in gru_torch.parameters():
        print(p.shape, p)
output:
torch.Size([9, 2]) Parameter containing:
tensor([[ 0.3825, 0.0833],
[-0.6773, 0.2752],
[-0.2886, 0.4721],
[ 0.3082, -0.7370],
[-0.5303, -0.2092],
[ 0.1531, 0.3818],
[ 0.0944, 0.2302],
[ 0.2978, 0.3987],
[ 0.1517, 0.5723]], requires_grad=True)
torch.Size([9, 3]) Parameter containing:
tensor([[-0.2145, -0.3477, 0.0127],
[ 0.2945, -0.1512, 0.1512],
[-0.2576, 0.7333, 0.4014],
[ 0.2292, 0.4185, -0.5688],
[ 0.3517, -0.0798, 0.4513],
[ 0.4202, -0.2101, -0.0386],
[-0.5116, -0.1925, -0.3951],
[ 0.4276, 0.2050, -0.3609],
[-0.0588, 0.1174, -0.0270]], requires_grad=True)
torch.Size([9]) Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
torch.Size([9]) Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
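Instead of eyeballing the printout, the copy can also be verified programmatically (a minimal sanity-check sketch):

assert np.allclose(gru_torch.weight_ih_l0.detach().numpy(), gru_keras_weights[0].T)
assert np.allclose(gru_torch.weight_hh_l0.detach().numpy(), gru_keras_weights[1].T)
assert np.allclose(gru_torch.bias_ih_l0.detach().numpy(), gru_keras_weights[2][0])
assert np.allclose(gru_torch.bias_hh_l0.detach().numpy(), gru_keras_weights[2][1])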
Now the weights match exactly. Take a forward pass through the PyTorch GRU with the same data:
with tt.no_grad():
    out_gru_torch, _ = gru_torch(tt.tensor(xx))
    print(out_gru_torch.shape, out_gru_torch)
output:
torch.Size([1, 4, 3]) tensor([[[0.0638, 0.2232, 0.1145],
[0.0553, 0.2741, 0.1618],
[0.0306, 0.2399, 0.1644],
[0.1231, 0.3970, 0.3153]]])
But it can be seen that the outputs differ in value:
out_gru_torch.numpy() - out_gru_keras.numpy()
output:
array([[[ 0.00813178, -0.00323714, -0.02592391],
[ 0.0050362 , -0.000976 , -0.01662555],
[ 0.00321813, 0.0024882 , -0.00869839],
[ 0.0180769 , 0.01671427, -0.00050169]]], dtype=float32)
Summing up the absolute differences:
np.sum(np.abs(out_gru_torch.numpy() - out_gru_keras.numpy()))
output:
0.10962818
Why is there such a significant difference?
I also tried the same thing with LSTMs and found that the difference was insignificant, which can perhaps be attributed to floating-point rounding errors. Here is some code.
Make a Keras LSTM:
lstm_keras = layers.LSTM(hidden_size, return_sequences=True)
out_lstm_keras = lstm_keras(xx)
out_lstm_keras
output:
<tf.Tensor: shape=(1, 4, 3), dtype=float32, numpy=
array([[[-0.12176377, -0.07746243, -0.08807365],
[-0.17760691, -0.11547467, -0.12406464],
[-0.16980645, -0.1159803 , -0.12289675],
[-0.16330168, -0.08463871, -0.18625976]]], dtype=float32)>
Check the Keras LSTM weights:
lstm_keras_weights = lstm_keras.get_weights()
print(f'{len(lstm_keras_weights)=}')
for w in lstm_keras_weights:
    print(w.shape, w.dtype, w)
output:
len(lstm_keras_weights)=3
(2, 12) float32 [[-0.18769234 0.6526979 0.27196562 -0.23817068 -0.05964065 -0.11090988
-0.6442989 -0.4168117 -0.344454 0.12466687 -0.6536666 -0.28540143]
[-0.35713544 0.34027737 0.09951967 -0.21514818 0.47551024 0.305395
0.16330504 0.22410381 -0.13371867 0.21646535 0.01366949 0.4818431 ]]
(3, 12) float32 [[-0.22128499 -0.17296375 0.03671373 0.16226508 0.19011612 -0.41836154
-0.5816412 -0.32847112 -0.31468534 -0.27402246 0.05426207 0.24291728]
[ 0.4650536 -0.57491106 0.01105271 -0.0380749 0.2271702 0.39930764
0.11620218 -0.19071549 -0.30224687 -0.13937864 -0.27111995 0.08010413]
[ 0.45147026 0.3288076 -0.37750363 0.35117835 -0.31541684 -0.1335725
-0.0910389 -0.24736843 0.03350063 -0.27691114 -0.28898126 -0.27222085]]
(12,) float32 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
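The block of ones in the bias presumably comes from Keras's default unit_forget_bias=True, which initializes the forget-gate bias to 1. Spelling out the defaults the layer above relies on (again, these are my assumptions about the defaults):

lstm_keras_explicit = layers.LSTM(
    hidden_size,
    activation='tanh',
    recurrent_activation='sigmoid',
    use_bias=True,
    unit_forget_bias=True,   # forget-gate bias initialized to ones
    return_sequences=True,
)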
Now create a PyTorch LSTM and check its weights:
lstm_torch = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    bias=True,
    batch_first=True,
    num_layers=1,
    dropout=0.0,
    bidirectional=False,
    dtype=tt.float32
)
sd_lstm_torch = lstm_torch.state_dict()
for k, v in sd_lstm_torch.items():
    print(f'{k}, {v.shape}')
output:
weight_ih_l0, torch.Size([12, 2])
weight_hh_l0, torch.Size([12, 3])
bias_ih_l0, torch.Size([12])
bias_hh_l0, torch.Size([12])
Note that the Keras LSTM implements only a single bias vector (not a stack of two as in the GRU), so I copy the Keras bias into one of the two PyTorch biases and zero out the other. It doesn't make much difference which one receives the copy; here I copy it into bias_ih_l0 and set bias_hh_l0 to zero (an equivalent alternative is sketched after the copy below).
with tt.no_grad():
    lstm_torch.get_parameter('weight_ih_l0').copy_(tt.tensor(lstm_keras_weights[0].T))
    lstm_torch.get_parameter('weight_hh_l0').copy_(tt.tensor(lstm_keras_weights[1].T))
    lstm_torch.get_parameter('bias_ih_l0').copy_(tt.tensor(lstm_keras_weights[2]))
    lstm_torch.get_parameter('bias_hh_l0').copy_(tt.zeros(lstm_keras_weights[2].shape))
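Since PyTorch simply adds bias_ih and bias_hh inside each LSTM gate, an equivalent alternative would be to split the single Keras bias evenly between the two (a sketch; I kept the single-copy version above for the printouts below):

half_bias = tt.tensor(lstm_keras_weights[2] / 2.0)
with tt.no_grad():
    lstm_torch.bias_ih_l0.copy_(half_bias)
    lstm_torch.bias_hh_l0.copy_(half_bias)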
Checking the parameters again
with tt.no_grad():
    for p in lstm_torch.parameters():
        print(p.shape, p)
output:
torch.Size([12, 2]) Parameter containing:
tensor([[-0.1877, -0.3571],
[ 0.6527, 0.3403],
[ 0.2720, 0.0995],
[-0.2382, -0.2151],
[-0.0596, 0.4755],
[-0.1109, 0.3054],
[-0.6443, 0.1633],
[-0.4168, 0.2241],
[-0.3445, -0.1337],
[ 0.1247, 0.2165],
[-0.6537, 0.0137],
[-0.2854, 0.4818]], requires_grad=True)
torch.Size([12, 3]) Parameter containing:
tensor([[-0.2213, 0.4651, 0.4515],
[-0.1730, -0.5749, 0.3288],
[ 0.0367, 0.0111, -0.3775],
[ 0.1623, -0.0381, 0.3512],
[ 0.1901, 0.2272, -0.3154],
[-0.4184, 0.3993, -0.1336],
[-0.5816, 0.1162, -0.0910],
[-0.3285, -0.1907, -0.2474],
[-0.3147, -0.3022, 0.0335],
[-0.2740, -0.1394, -0.2769],
[ 0.0543, -0.2711, -0.2890],
[ 0.2429, 0.0801, -0.2722]], requires_grad=True)
torch.Size([12]) Parameter containing:
tensor([0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.], requires_grad=True)
torch.Size([12]) Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
Now do a forward pass through the PyTorch LSTM with the same data:
with tt.no_grad():
    out_lstm_torch, _ = lstm_torch(tt.tensor(xx))
    print(out_lstm_torch.shape, out_lstm_torch)
output:
torch.Size([1, 4, 3]) tensor([[[-0.1218, -0.0775, -0.0881],
[-0.1776, -0.1155, -0.1241],
[-0.1698, -0.1160, -0.1229],
[-0.1633, -0.0846, -0.1863]]])
Summing up the absolute differences:
np.sum(np.abs(out_lstm_torch.numpy() - out_lstm_keras.numpy()))
output:
2.7567148e-07
The difference is negligible here. I also found that both PyTorch and Keras assume an all-zero initial hidden state by default, so I don't need to set it manually.
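To double-check that, here is a sketch that passes explicit zero initial states to both GRUs and compares against the outputs above; I'd expect the differences to be exactly zero if the defaults really are zero states:

h0_torch = tt.zeros(1, batch_size, hidden_size)   # (num_layers, batch, hidden_size)
with tt.no_grad():
    out_gru_torch_h0, _ = gru_torch(tt.tensor(xx), h0_torch)
print(np.abs(out_gru_torch_h0.numpy() - out_gru_torch.numpy()).max())   # expect 0.0

h0_keras = tf.zeros((batch_size, hidden_size))
out_gru_keras_h0 = gru_keras(xx, initial_state=h0_keras)
print(np.abs(out_gru_keras_h0.numpy() - out_gru_keras.numpy()).max())   # expect 0.0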