What is proper way to mimic keras timedistributed layer in pytorch?

Question

I'm trying to mimic TimeDistributed in PyTorch just like keras TimeDistributed. please see below model

class GRULinear(nn.Module):
    def __init__(self,input_size, hidden_size, num_layers, batch_first=False):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first)
        self.fc = nn.Sequential(nn.ReLU(True),
                                nn.Linear(hidden_size, hidden_size), 
                                nn.ReLU(True))
    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out)
        return out


class CNN_GRU(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(CNN_GRU, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.feature_extractor = nn.Sequential(
    
            nn.Conv2d(input_dim, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2,2)),
            
            nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            nn.Flatten()
        )

        self.gru_linear = GRULinear(16000, output_dim, 2, True)


    def forward(self, state):

        features=[]
        for i in range(state.shape[1]):
            features.append(self.feature_extractor(state[0][i]).unsqueeze(1))
        
        features = torch.flatten(torch.cat(features, axis=1), start_dim=1)
        features = torch.reshape(features, shape=(1,)+features.shape)

        outs = torch.flatten(self.gru_linear(features))
 
        return outs

TEST(using torchsummary (from torchsummary import summary)):

model = CNN_GRU(3,64).to('cuda')

summary(model, input_size=(125, 1, 3 , 16, 16))

OUTPUT SNIPPET:

================================================================
Total params: 59,758,160
Trainable params: 59,758,160
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.37
Forward/backward pass size (MB): 34.15
Params size (MB): 227.96
Estimated Total Size (MB): 262.48
----------------------------------------------------------------

FULL-OUTPUT :

https://justpaste.it/31r4v

I've tested above model using torchsymmary. it has a lot more trainable parameters than keras TimeDistributed model with the same parameters. i don't understand what I'm doing wrong. so what is proper way to mimic TimeDistributed?. if any other way possible then it would be grate to give some points.

What is proper way to mimic keras timedistributed layer in pytorch?

0 Answers0