I am running a PyTorch implementation of this article (https://arxiv.org/pdf/1604.03650.pdf).
The article says to initialize the deconvolution layers with bilinear interpolation, but that initialization is not in the code. It says that if a deconvolution scales the data by a factor S, the initial weights are defined as:

C = (2S - 1 - S mod 2) / (2S)
W[i, j] = (1 - |i/S - C|) * (1 - |j/S - C|)
Does anyone know how I can implement it?
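My best guess is that this is the same kernel that Caffe's BilinearFiller builds for FCN-style upsampling, so a helper along these lines should reproduce it (only a sketch; bilinear_kernel is my own name, not from the paper's code, and the weight layout assumes nn.ConvTranspose2d):

import numpy as np
import torch

def bilinear_kernel(in_channels, out_channels, stride):
    # nn.ConvTranspose2d stores its weight as (in_channels, out_channels, kH, kW).
    # The paper pairs upscale factor S with kernel_size = 2S, which is also
    # what the model code below does.
    S = stride
    ksize = 2 * S
    C = (2 * S - 1 - S % 2) / (2.0 * S)  # kernel centre, from the formula above
    og = np.ogrid[:ksize, :ksize]
    filt = (1 - abs(og[0] / S - C)) * (1 - abs(og[1] / S - C))
    weight = np.zeros((in_channels, out_channels, ksize, ksize), dtype=np.float32)
    # Simplest choice: give every input/output channel pair the same bilinear
    # filter (what Caffe's BilinearFiller does). When in_channels == out_channels
    # one can instead fill only the diagonal pairs, which makes the layer an
    # exact bilinear upsampler:
    # weight[range(in_channels), range(out_channels)] = filt
    weight[:, :] = filt
    return torch.from_numpy(weight)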
This is the neural network model designed based on the article. What I don't know is how to initialize the deconvolution layers (for example, the deconv_1 block in this code):
import torch
import torch.nn as nn
import numpy as np
import torchvision
import torch.nn.functional as F

cfg = [64, 128, 256, 512, 512]  # channel widths of the five VGG16 blocks

class Deep3d(nn.Module):
    def __init__(self, in_channels=3, out_channels=3, device=torch.device('cpu')):
        super(Deep3d, self).__init__()
        self.device = device
        vgg16 = torchvision.models.vgg16_bn(pretrained=True)

        # Split the pretrained VGG16 feature extractor into five blocks,
        # cutting after each MaxPool2d.
        modules = []
        layer = []
        for l in vgg16.features:
            if isinstance(l, nn.MaxPool2d):
                layer.append(l)
                modules.append(layer)
                layer = []
            else:
                layer.append(l)

        # Build one prediction branch per block: two 3x3 convs followed by a
        # deconvolution that brings the 65-channel disparity volume of every
        # block back to the resolution of the first block's output.
        scale = 1
        deconv = []
        layer = []
        for m in range(len(modules)):
            layer.append(nn.Conv2d(cfg[m], cfg[m], kernel_size=3, stride=1, padding=1))
            layer.append(nn.ReLU(inplace=True))
            layer.append(nn.Conv2d(cfg[m], cfg[m], kernel_size=3, stride=1, padding=1))
            layer.append(nn.ReLU(inplace=True))
            if m == 0:
                layer.append(nn.ConvTranspose2d(cfg[m], 65, kernel_size=1, stride=1, padding=(0, 0)))
            else:
                scale *= 2
                layer.append(nn.ConvTranspose2d(cfg[m], 65, kernel_size=scale * 2, stride=scale,
                                                padding=(scale // 2, scale // 2)))
            deconv.append(layer)  # add this block of layers to the deconv part of the network
            layer = []

        self.module_1 = nn.Sequential(*modules[0])
        self.module_2 = nn.Sequential(*modules[1])
        self.module_3 = nn.Sequential(*modules[2])
        self.module_4 = nn.Sequential(*modules[3])
        self.module_5 = nn.Sequential(*modules[4])

        self.deconv_1 = nn.Sequential(*deconv[0])
        self.deconv_2 = nn.Sequential(*deconv[1])
        self.deconv_3 = nn.Sequential(*deconv[2])
        self.deconv_4 = nn.Sequential(*deconv[3])
        self.deconv_5 = nn.Sequential(*deconv[4])

        self.linear_module = nn.Sequential(nn.Linear(15360, 4096),  # hyperparameter choice
                                           nn.ReLU(inplace=True),
                                           nn.Dropout(p=0.5),
                                           nn.Linear(4096, 1950))  # 1950 = 65 (disparity range) * 3 * 10 (feature-map size)
        self.deconv_6 = nn.Sequential(nn.ConvTranspose2d(65, 65, kernel_size=scale * 2, stride=scale,
                                                         padding=(scale // 2, scale // 2)))
        self.upconv_final = nn.Sequential(nn.ConvTranspose2d(65, 65, kernel_size=(4, 4), stride=2, padding=(1, 1)),
                                          nn.ReLU(inplace=True),
                                          nn.Conv2d(65, 65, kernel_size=(3, 3), stride=1, padding=(1, 1)),
                                          nn.Softmax(dim=1))

        # Note: this loop never matches the ConvTranspose2d layers, which is
        # exactly the bilinear initialization that is missing.
        for block in [self.deconv_1, self.deconv_2, self.deconv_3, self.deconv_4,
                      self.deconv_5, self.deconv_6, self.linear_module, self.upconv_final]:
            for m in block:
                if isinstance(m, nn.Conv2d):
                    nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                    if m.bias is not None:
                        nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.BatchNorm2d):
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.Linear):
                    nn.init.normal_(m.weight, 0, 0.01)
                    nn.init.constant_(m.bias, 0)

    def forward(self, orig_x, x):
        # orig_x: full-resolution left view; x: downscaled input to the encoder
        x_copy = orig_x
        out_1 = self.module_1(x)
        out_2 = self.module_2(out_1)
        out_3 = self.module_3(out_2)
        out_4 = self.module_4(out_3)
        out_5 = self.module_5(out_4)
        out_5_flatten = out_5.view(x_copy.shape[0], -1)
        out_6 = self.linear_module(out_5_flatten)

        # Each branch predicts a 65-channel disparity volume at the same resolution.
        p1 = self.deconv_1(out_1)
        p2 = self.deconv_2(out_2)
        p3 = self.deconv_3(out_3)
        p4 = self.deconv_4(out_4)
        p5 = self.deconv_5(out_5)
        p6 = self.deconv_6(out_6.view(x_copy.shape[0], 65, 3, 10))
        pred = [p1, p2, p3, p4, p5, p6]

        # Sum the branch predictions, then upsample to the left-view resolution.
        out = torch.zeros(pred[0].shape).to(self.device)
        for p in pred:
            out = torch.add(out, p)
        out = self.upconv_final(out)  # softmaxed; to be element-wise multiplied with shifted left views
        out = F.interpolate(out, scale_factor=4, mode='bilinear', align_corners=False)  # upscale to match the left input size

        # Selection layer: build 65 horizontally shifted copies of the left view
        # and blend them with the softmaxed disparity probabilities.
        stacked_shifted_view = None
        stacked_out = None
        for depth_map_idx in range(-33, 32):
            shifted_input_view = torch.zeros(x_copy.size()).to(self.device)
            if depth_map_idx < 0:
                shifted_input_view[:, :, :, :depth_map_idx] = x_copy[:, :, :, -depth_map_idx:]
            elif depth_map_idx == 0:
                shifted_input_view = x_copy
            else:
                shifted_input_view[:, :, :, depth_map_idx:] = x_copy[:, :, :, :-depth_map_idx]
            if stacked_shifted_view is None:
                stacked_shifted_view = shifted_input_view.unsqueeze(1)
            else:
                stacked_shifted_view = torch.cat((stacked_shifted_view, shifted_input_view.unsqueeze(1)), dim=1)
            if stacked_out is None:
                stacked_out = out[:, depth_map_idx + 33:depth_map_idx + 34, :, :].unsqueeze(1)
            else:
                stacked_out = torch.cat((stacked_out, out[:, depth_map_idx + 33:depth_map_idx + 34, :, :].unsqueeze(1)), dim=1)

        mult_soft_shift_out = torch.mul(stacked_out, stacked_shifted_view)
        final_rt_image = torch.sum(mult_soft_shift_out, dim=1)
        return final_rt_image

# if __name__ == '__main__':
#     model = Deep3d().to(torch.device('cpu'))
#     out = model(torch.randn(10, 3, 384, 1280), torch.randn(10, 3, 96, 320))
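If that helper is right, I imagine wiring it in would look something like this (again only a sketch, assuming every upsampling ConvTranspose2d keeps the kernel_size = 2 * stride convention the model above already uses):

model = Deep3d()
for m in model.modules():
    if isinstance(m, nn.ConvTranspose2d) and m.stride[0] > 1:
        S = m.stride[0]                    # upscale factor of this layer
        assert m.kernel_size[0] == 2 * S   # matches the paper's k = 2S convention
        with torch.no_grad():
            m.weight.copy_(bilinear_kernel(m.in_channels, m.out_channels, S))
            if m.bias is not None:
                m.bias.zero_()

But I am not sure this is what the article intends, especially for the layers where the input and output channel counts differ (cfg[m] -> 65).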