This is my PyTorch architecture; it takes color 3x256x256 images as input:
import torch
import torch.nn as nn

class AutoEncoder(nn.Module):
    def __init__(self, channels: int, latent_dim: int):
        super().__init__()
        ifac = 2  # widens every conv layer by this factor
        lw = 2
        im = 2
        # latent_dim is unused as long as the bottleneck layers below stay commented out.
        # Shape comments assume channels=3, ifac=2, lw=2, im=2 and read "n C H W".
        self.encoder = nn.Sequential(
            nn.Conv2d(channels, channels*2**1*ifac, kernel_size=3, padding=1, stride=2),               # n 12 128 128
            nn.Tanh(),
            nn.Conv2d(channels*2**1*ifac, channels*2**2*ifac, kernel_size=3, padding=1, stride=2),     # n 24 64 64
            nn.Tanh(),
            nn.Conv2d(channels*2**2*ifac, channels*2**3*ifac, kernel_size=3, padding=1, stride=2),     # n 48 32 32
            nn.Tanh(),
            nn.Conv2d(channels*2**3*ifac, channels*2**4*ifac, kernel_size=3, padding=1, stride=2),     # n 96 16 16
            nn.Tanh(),
            nn.Conv2d(channels*2**4*ifac, channels*2**5*ifac, kernel_size=3, padding=1, stride=2),     # n 192 8 8
            nn.Tanh(),
            nn.Conv2d(channels*2**5*ifac, channels*2**6*ifac, kernel_size=3, padding=1, stride=2),     # n 384 4 4
            nn.Tanh(),
            nn.Conv2d(channels*2**6*ifac, channels*2**7*ifac, kernel_size=3, padding=1, stride=2),     # n 768 2 2
            nn.Tanh(),
            nn.Conv2d(channels*2**7*ifac, channels*2**8*ifac*lw, kernel_size=2, padding=0, stride=1),  # n 3072 1 1
            nn.Tanh(),
            nn.Flatten(),
            nn.Linear(channels*2**8*ifac*lw, 1024*im),                                                 # n 2048
            nn.Tanh(),
            # nn.Linear(1024*im, latent_dim),
            # nn.Tanh(),
        ).cuda()
        self.decoder = nn.Sequential(
            # nn.Linear(latent_dim, 1024*im),
            # nn.Tanh(),
            nn.Linear(1024*im, channels*2**8*ifac*lw),                                                 # n 3072
            nn.Tanh(),
            Reshape(-1, channels*2**8*ifac*lw, 1, 1),                                                  # n 3072 1 1
            nn.ConvTranspose2d(channels*2**8*ifac*lw, channels*2**7*ifac, kernel_size=2, padding=0, output_padding=0, stride=1),  # n 768 2 2
            nn.Tanh(),
            nn.ConvTranspose2d(channels*2**7*ifac, channels*2**6*ifac, kernel_size=3, padding=1, output_padding=1, stride=2),     # n 384 4 4
            nn.Tanh(),
            nn.ConvTranspose2d(channels*2**6*ifac, channels*2**5*ifac, kernel_size=3, padding=1, output_padding=1, stride=2),     # n 192 8 8
            nn.Tanh(),
            nn.ConvTranspose2d(channels*2**5*ifac, channels*2**4*ifac, kernel_size=3, padding=1, output_padding=1, stride=2),     # n 96 16 16
            nn.Tanh(),
            nn.ConvTranspose2d(channels*2**4*ifac, channels*2**3*ifac, kernel_size=3, padding=1, output_padding=1, stride=2),     # n 48 32 32
            nn.Tanh(),
            nn.ConvTranspose2d(channels*2**3*ifac, channels*2**2*ifac, kernel_size=3, padding=1, output_padding=1, stride=2),     # n 24 64 64
            nn.Tanh(),
            nn.ConvTranspose2d(channels*2**2*ifac, channels*2**1*ifac, kernel_size=3, padding=1, output_padding=1, stride=2),     # n 12 128 128
            nn.Tanh(),
            nn.ConvTranspose2d(channels*2**1*ifac, channels*2**0*ifac, kernel_size=3, padding=1, output_padding=1, stride=2),     # n 6 256 256
            nn.Tanh(),
            nn.ConvTranspose2d(channels*2**0*ifac, channels, kernel_size=3, padding=1, stride=1),                                 # n 3 256 256
            nn.Sigmoid()
        ).cuda()

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
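(Reshape isn't a torch built-in; it's a small helper module of mine, roughly:)

class Reshape(nn.Module):
    """Reshapes the flat latent vector back into an (N, C, H, W) feature map."""
    def __init__(self, *shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.view(*self.shape)

The shapes all line up; a quick smoke test (latent_dim can be anything while the bottleneck layers are commented out):

model = AutoEncoder(channels=3, latent_dim=1024)
z = model.encoder(torch.rand(1, 3, 256, 256).cuda())
print(z.shape)                 # torch.Size([1, 2048])
print(model.decoder(z).shape)  # torch.Size([1, 3, 256, 256])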
The training loop looks like this:
while True:
    avgloss = 0.0
    fac = 1.0 / len(tensor_batches)
    for batch_tensor in tensor_batches:
        # print(batch_tensor.size())  # -> torch.Size([1, 3, 256, 256])
        output_tensor = model(batch_tensor)
        loss = criterion(output_tensor, batch_tensor)
        avgloss = avgloss + loss.item() * fac
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"--- Loss: {avgloss:.4f} next epoch ---")
After training converges (at a very high loss of around 0.1), it produces images like this:

I would at least expect some random shapes and clouds in the output instead of a single flat color. Also, before training, shouldn't it produce a completely noisy (non-repetitive) image just from the random weights?
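This is easy to probe on a fresh, untrained model:

untrained = AutoEncoder(channels=3, latent_dim=1024)
with torch.no_grad():
    out = untrained(torch.rand(1, 3, 256, 256).cuda())
# If the spatial spread is near zero, the "noise" is really one flat color.
print("mean:", out.mean().item(), "std:", out.std().item())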
The model only starts to produce something different (a blocky, blurry version of the input image) if I remove all of the smaller layers except the last one or two.
I have tried different activation functions, loss functions, optimizers, and a bunch of different hyperparameters.
Even if I train on as few as 4 pictures, it can't produce anything but a single averaged, noisy color.
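One way to check whether any learning signal even reaches the early layers is to dump gradient magnitudes right after a loss.backward() call:

# Tiny norms on the first conv layers would point at vanishing
# gradients through the long stack of Tanh blocks.
for name, p in model.encoder.named_parameters():
    if p.grad is not None:
        print(f"{name:20s} grad norm: {p.grad.norm().item():.3e}")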
It looks like it breaks whenever I add the third layer (channels*2**2*ifac to channels*2**3*ifac). Even increasing the channel width by a factor of 128 doesn't make a difference.
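To narrow down where it breaks, forward hooks on the encoder's Tanh layers show where the activations collapse or saturate:

def probe(module, inputs, output):
    # std near 0 means the feature map has collapsed to a constant;
    # max|x| near 1 means the Tanh is saturating.
    print(f"Tanh out: std={output.std().item():.4f}  max={output.abs().max().item():.3f}")

hooks = [m.register_forward_hook(probe)
         for m in model.encoder if isinstance(m, nn.Tanh)]
with torch.no_grad():
    model(batch_tensor)  # any (1, 3, 256, 256) batch from tensor_batches
for h in hooks:
    h.remove()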
How can I get an architecture with a one-dimensional (flat vector) latent space to work on 256x256 images?