I'm trying to mimic TimeDistributed in PyTorch just like keras TimeDistributed. please see below model
class GRULinear(nn.Module):
def __init__(self,input_size, hidden_size, num_layers, batch_first=False):
super().__init__()
self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first)
self.fc = nn.Sequential(nn.ReLU(True),
nn.Linear(hidden_size, hidden_size),
nn.ReLU(True))
def forward(self, x):
out, _ = self.gru(x)
out = self.fc(out)
return out
class CNN_GRU(nn.Module):
def __init__(self, input_dim, output_dim):
super(CNN_GRU, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.feature_extractor = nn.Sequential(
nn.Conv2d(input_dim, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),
nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.MaxPool2d(kernel_size=(2, 2), stride=(2,2)),
nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),
nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(True),
nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),
nn.Flatten()
)
self.gru_linear = GRULinear(16000, output_dim, 2, True)
def forward(self, state):
features=[]
for i in range(state.shape[1]):
features.append(self.feature_extractor(state[0][i]).unsqueeze(1))
features = torch.flatten(torch.cat(features, axis=1), start_dim=1)
features = torch.reshape(features, shape=(1,)+features.shape)
outs = torch.flatten(self.gru_linear(features))
return outs
TEST(using torchsummary (from torchsummary import summary)):
model = CNN_GRU(3,64).to('cuda')
summary(model, input_size=(125, 1, 3 , 16, 16))
OUTPUT SNIPPET:
================================================================
Total params: 59,758,160
Trainable params: 59,758,160
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.37
Forward/backward pass size (MB): 34.15
Params size (MB): 227.96
Estimated Total Size (MB): 262.48
----------------------------------------------------------------
FULL-OUTPUT :
I've tested above model using torchsymmary. it has a lot more trainable parameters than keras TimeDistributed model with the same parameters. i don't understand what I'm doing wrong. so what is proper way to mimic TimeDistributed?. if any other way possible then it would be grate to give some points.