
I'm trying to mimic Keras's TimeDistributed layer in PyTorch. Please see the model below:

import torch
import torch.nn as nn


class GRULinear(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, batch_first=False):
        super().__init__()
        # Pass batch_first by keyword: nn.GRU's fourth positional argument is bias, not batch_first.
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=batch_first)
        self.fc = nn.Sequential(nn.ReLU(True),
                                nn.Linear(hidden_size, hidden_size), 
                                nn.ReLU(True))
    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out)
        return out


class CNN_GRU(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(CNN_GRU, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.feature_extractor = nn.Sequential(
    
            nn.Conv2d(input_dim, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2,2)),
            
            nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),

            nn.Flatten()
        )

        # 16000 = 125 timesteps * 128 CNN features per timestep (the 16x16 test input below is halved by four max-pools to 1x1, with 128 channels).
        self.gru_linear = GRULinear(16000, output_dim, 2, batch_first=True)


    def forward(self, state):
        # state: (batch, time, 1, channels, height, width) for the test input below.
        # Apply the CNN to each timestep of the first batch element separately.
        features = []
        for i in range(state.shape[1]):
            features.append(self.feature_extractor(state[0][i]).unsqueeze(1))

        # Stack along the time axis, flatten all timestep features into one vector,
        # and feed it to the GRU as a length-1 sequence.
        features = torch.flatten(torch.cat(features, dim=1), start_dim=1)
        features = torch.reshape(features, shape=(1,) + features.shape)

        outs = torch.flatten(self.gru_linear(features))

        return outs

TEST (using torchsummary, i.e. from torchsummary import summary):

model = CNN_GRU(3,64).to('cuda')

summary(model, input_size=(125, 1, 3 , 16, 16))

OUTPUT SNIPPET:

================================================================
Total params: 59,758,160
Trainable params: 59,758,160
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.37
Forward/backward pass size (MB): 34.15
Params size (MB): 227.96
Estimated Total Size (MB): 262.48
----------------------------------------------------------------

FULL OUTPUT:

https://justpaste.it/31r4v
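
For reference, a direct way to count the trainable parameters, independent of torchsummary, would be the usual numel-based sum:

total = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(total)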

I've tested the above model using torchsummary. It has a lot more trainable parameters than a Keras model built with TimeDistributed and the same layer sizes. I don't understand what I'm doing wrong, so what is the proper way to mimic TimeDistributed? If there is another way to do this, it would be great to get some pointers.
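
My understanding is that TimeDistributed simply applies the same module to every timestep, and a common way I've seen this done in PyTorch is to fold the time dimension into the batch dimension before the wrapped module and unfold it afterwards. Here is a minimal sketch of that idea (the wrapper below is my own assumption, not code from Keras or PyTorch):

import torch
import torch.nn as nn

class TimeDistributed(nn.Module):
    """Applies the wrapped module to every timestep of a (batch, time, ...) input."""
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        batch, time = x.shape[:2]
        # Fold time into the batch dimension: (batch, time, ...) -> (batch * time, ...)
        out = self.module(x.reshape(batch * time, *x.shape[2:]))
        # Unfold back to (batch, time, ...)
        return out.reshape(batch, time, *out.shape[1:])

# Hypothetical usage, e.g. with an image sequence x of shape (batch, time, channels, height, width):
# td = TimeDistributed(nn.Conv2d(3, 16, kernel_size=3, padding=1))
# y = td(x)  # (batch, time, 16, height, width)

Is something like this the intended equivalent, or is my per-timestep loop above fine?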
