Hey,
I'm working on vascular detection using representation learning based on medical ultrasound data.
My aim is to analyse if the use of DCNv2 has benefits compared to the use of regular convolution.
For that I created auto-encoders with and without deformable convolutional layers.
So far, using deformable convolution has led to worse results than using regular convolution.
That's not what I was expecting after reading multiple papers and articles that praise DCN (e.g. DCN for MRI classification, the msracver DCNv2 implementation, a towardsdatascience article).
I was hoping someone could give me some hints on how I could improve the architecture of the auto-encoder, so that training with deformable convolution yields better results.
Training specifications:
training_dataset size=45.000 % patches
patch_size=(24,24) % in pixels
batch_size=128
latent_space_dimension=128
epochs=500
learning_rate=0.001
normalize=false % patches have values between 0 and 255
loss_criterion=Mean_Squared_Error
optimizer=torch.optim.SGD()
Architectures:
The results of the training runs have been uploaded to Weights and Biases (see links).
regular convolution
Results of training using regular convolution.
class Cnn53maxNormDropModelEncoder(torch.nn.Module):
    """Convolutional encoder: two conv+max-pool stages followed by an FC head.

    Maps a (B, 1, 24, 24) patch to a num_classes-dimensional code and also
    returns the pre-pool feature sizes and pooling indices that the matching
    MaxUnpool2d layers in the decoder require.
    """

    def __init__(self, num_classes, bias):
        super(Cnn53maxNormDropModelEncoder, self).__init__()
        # NOTE(review): `bias` is accepted but never used here, unlike the
        # deformable variant where it is forwarded to DeformableConv2d.
        self.conv1 = self.conv1_block(1, 32)
        self.pool1 = torch.nn.MaxPool2d((2, 2), return_indices=True)
        self.conv2 = self.conv2_block(32, 64)
        self.pool2 = torch.nn.MaxPool2d((2, 2), return_indices=True)
        self.fc1 = torch.nn.Linear(1024, 128)
        self.relu = torch.nn.LeakyReLU()
        self.norm = torch.nn.BatchNorm1d(128)
        self.drop = torch.nn.Dropout(p=0.15)
        self.fc2 = torch.nn.Linear(128, num_classes)

    @staticmethod
    def conv1_block(in_c, out_c):
        # 5x5 valid convolution followed by a LeakyReLU non-linearity.
        layers = [
            torch.nn.Conv2d(in_c, out_c, kernel_size=(5, 5)),
            torch.nn.LeakyReLU(),
        ]
        return torch.nn.Sequential(*layers)

    @staticmethod
    def conv2_block(in_c, out_c):
        # 3x3 valid convolution followed by a LeakyReLU non-linearity.
        layers = [
            torch.nn.Conv2d(in_c, out_c, kernel_size=(3, 3)),
            torch.nn.LeakyReLU(),
        ]
        return torch.nn.Sequential(*layers)

    def forward(self, x):
        # Stage 1: 24 -> 20 (conv) -> 10 (pool); keep size/indices for unpooling.
        features = self.conv1(x)
        size1 = features.size()
        features, indices1 = self.pool1(features)
        # Stage 2: 10 -> 8 (conv) -> 4 (pool).
        features = self.conv2(features)
        size2 = features.size()
        features, indices2 = self.pool2(features)
        # Flatten to (B, 1024) and project through the FC head to the code.
        flat = features.view(features.size(0), -1)
        latent = self.drop(self.norm(self.relu(self.fc1(flat))))
        latent = self.fc2(latent)
        return latent, size1, size2, indices1, indices2
class Cnn53maxNormDropModelDecoder(torch.nn.Module):
def __init__(self, num_classes, bias):
super(Cnn53maxNormDropModelDecoder, self).__init__()
self.fc2 = torch.nn.Linear(num_classes, 128)
self.drop = torch.nn.Dropout(p=0.15)
self.norm = torch.nn.BatchNorm1d(128)
self.relu = torch.nn.LeakyReLU()
self.fc1 = torch.nn.Linear(128, 1024)
self.pool2 = torch.nn.MaxUnpool2d((2, 2))
self.conv2 = self.conv2_block(64, 32)
self.pool1 = torch.nn.MaxUnpool2d((2, 2))
self.conv1 = self.conv1_block(in_c=32, out_c=1)
@staticmethod
def conv1_block(in_c, out_c):
return torch.nn.Sequential(
torch.nn.LeakyReLU(),
torch.nn.ConvTranspose2d(in_c, out_c, kernel_size=(5, 5))
)
@staticmethod
def conv2_block(in_c, out_c):
return torch.nn.Sequential(
torch.nn.LeakyReLU(),
torch.nn.ConvTranspose2d(in_c, out_c, kernel_size=(3, 3))
)
def forward(self, encoded, size1, size2, pool1, pool2):
out = self.fc2(encoded)
out = self.drop(out)
out = self.norm(out)
out = self.relu(out)
out = self.fc1(out)
out = out.view(out.size(0), 64, 4, 4)
out = self.pool2(out, pool2, output_size=size2)
out = self.conv2(out)
out = self.pool1(out, pool1, output_size=size1)
out = self.conv1(out)
return out
class Cnn53maxNormDropModel(torch.nn.Module):
    """Auto-encoder assembled from an encoder and a matching decoder."""

    def __init__(self, model_encoder, model_decoder):
        super(Cnn53maxNormDropModel, self).__init__()
        self.encoder = model_encoder
        self.decoder = model_decoder

    def forward(self, x):
        # The encoder also emits the pre-pool sizes and pooling indices that
        # the decoder's MaxUnpool2d layers require.
        latent, size1, size2, idx1, idx2 = self.encoder(x)
        return self.decoder(latent, size1, size2, idx1, idx2)
deformable convolution
Swapping second regular convolutional layer for deformable convolution.
Results of training using deformable convolution.
class Dcn53dmaxNormDropModelEncoder(torch.nn.Module):
    """Encoder variant whose second conv layer is a deformable convolution.

    Same layout as Cnn53maxNormDropModelEncoder, except the 3x3 stage uses
    DeformableConv2d (with `bias` forwarded) instead of a regular Conv2d.
    """

    def __init__(self, num_classes, bias):
        super(Dcn53dmaxNormDropModelEncoder, self).__init__()
        self.conv1 = self.conv1_block(1, 32)
        self.pool1 = torch.nn.MaxPool2d((2, 2), return_indices=True)
        self.conv2 = self.conv2_block(32, 64, bias)
        self.pool2 = torch.nn.MaxPool2d((2, 2), return_indices=True)
        self.fc1 = torch.nn.Linear(1024, 128)
        self.relu = torch.nn.LeakyReLU()
        self.norm = torch.nn.BatchNorm1d(128)
        self.drop = torch.nn.Dropout(p=0.15)
        self.fc2 = torch.nn.Linear(128, num_classes)

    @staticmethod
    def conv1_block(in_c, out_c):
        # 5x5 valid convolution followed by a LeakyReLU non-linearity.
        layers = [
            torch.nn.Conv2d(in_c, out_c, kernel_size=(5, 5)),
            torch.nn.LeakyReLU(),
        ]
        return torch.nn.Sequential(*layers)

    @staticmethod
    def conv2_block(in_c, out_c, bias):
        # 3x3 deformable convolution (no padding) followed by a LeakyReLU.
        layers = [
            DeformableConv2d(in_c, out_c, kernel_size=3, stride=1, padding=0, bias=bias),
            torch.nn.LeakyReLU(),
        ]
        return torch.nn.Sequential(*layers)

    def forward(self, x):
        # Stage 1: 24 -> 20 (conv) -> 10 (pool); keep size/indices for unpooling.
        features = self.conv1(x)
        size1 = features.size()
        features, indices1 = self.pool1(features)
        # Stage 2 (deformable): 10 -> 8 (conv) -> 4 (pool).
        features = self.conv2(features)
        size2 = features.size()
        features, indices2 = self.pool2(features)
        # Flatten to (B, 1024) and project through the FC head to the code.
        flat = features.view(features.size(0), -1)
        latent = self.drop(self.norm(self.relu(self.fc1(flat))))
        latent = self.fc2(latent)
        return latent, size1, size2, indices1, indices2
class Dcn53dmaxNormDropModelDecoder(torch.nn.Module):
    """Decoder variant whose second conv stage is a deformable convolution.

    Same layout as Cnn53maxNormDropModelDecoder, except the 3x3 transposed
    convolution is replaced by DeformableConv2d with padding=2, which grows
    the spatial size by 2 just like a 3x3 ConvTranspose2d would.
    """

    def __init__(self, num_classes, bias):
        super(Dcn53dmaxNormDropModelDecoder, self).__init__()
        self.fc2 = torch.nn.Linear(num_classes, 128)
        self.drop = torch.nn.Dropout(p=0.15)
        self.norm = torch.nn.BatchNorm1d(128)
        self.relu = torch.nn.LeakyReLU()
        self.fc1 = torch.nn.Linear(128, 1024)
        self.pool2 = torch.nn.MaxUnpool2d((2, 2))
        self.conv2 = self.conv2_block(64, 32, bias)
        self.pool1 = torch.nn.MaxUnpool2d((2, 2))
        self.conv1 = self.conv1_block(in_c=32, out_c=1)

    @staticmethod
    def conv1_block(in_c, out_c):
        # LeakyReLU followed by a 5x5 transposed convolution (20 -> 24).
        layers = [
            torch.nn.LeakyReLU(),
            torch.nn.ConvTranspose2d(in_c, out_c, kernel_size=(5, 5)),
        ]
        return torch.nn.Sequential(*layers)

    @staticmethod
    def conv2_block(in_c, out_c, bias):
        # LeakyReLU followed by a 3x3 deformable convolution; padding=2 with
        # kernel 3 grows the map from 8 to 10, mirroring the encoder stage.
        layers = [
            torch.nn.LeakyReLU(),
            DeformableConv2d(in_c, out_c, kernel_size=3, stride=1, padding=2, bias=bias),
        ]
        return torch.nn.Sequential(*layers)

    def forward(self, encoded, size1, size2, pool1, pool2):
        # FC head mirrors the encoder's, applied in reverse order.
        hidden = self.relu(self.norm(self.drop(self.fc2(encoded))))
        hidden = self.fc1(hidden)
        # Reshape the 1024-dim vector back into a (B, 64, 4, 4) feature map.
        feat = hidden.view(hidden.size(0), 64, 4, 4)
        # Inverse of stage 2 (deformable): 4 -> 8 (unpool) -> 10 (conv).
        feat = self.pool2(feat, pool2, output_size=size2)
        feat = self.conv2(feat)
        # Inverse of stage 1: 10 -> 20 (unpool) -> 24 (transposed conv).
        feat = self.pool1(feat, pool1, output_size=size1)
        return self.conv1(feat)
class Dcn53dmaxNormDropModel(torch.nn.Module):
    """Auto-encoder assembled from the deformable encoder and decoder."""

    def __init__(self, model_encoder, model_decoder):
        super(Dcn53dmaxNormDropModel, self).__init__()
        self.encoder = model_encoder
        self.decoder = model_decoder

    def forward(self, x):
        # The encoder also emits the pre-pool sizes and pooling indices that
        # the decoder's MaxUnpool2d layers require.
        latent, size1, size2, idx1, idx2 = self.encoder(x)
        return self.decoder(latent, size1, size2, idx1, idx2)
More training results:
I've also tried architectures without using pooling and instead use a total of 5 convolutional layers with kernel size (5,5).
So I replaced:
def forward(self, x): # 24
out = self.conv1(x) # 20
size1 = out.size()
out, indices1 = self.pool1(out) # 10
out = self.conv2(out) # 8
size2 = out.size()
out, indices2 = self.pool2(out) # 4
with the following:
def forward(self, x): # 24
out = self.conv1(x) # 20
out = self.conv2(out) # 16
out = self.conv3(out) # 12
out = self.conv4(out) # 8
out = self.conv5(out) # 4
where each self.convX(out) is a block containing convolution, batch-normalization and LeakyReLu.
Full architecture attached. (without deformable: Cnn55555NormLeaky AND with deformable: DcnEnc5d5d555NormLeaky)
models_sandbox.txt
Training results with regular convolutional layers are fine. Results CNN
Training results with two deformable layers for conv1 and conv2 (only in the encoder) is significantly worse. Results DCN
I'm just replacing the regular convolutional layers with deformable convolution. To me it seems as if there's something wrong with the deformable convolution layer but I don't know what I should change to make it work.
I've attached the DCNv2 implementation I'm using. It's the implementation by developer0hye.
dcn2d.txt
I'm relatively new to ML and might just overlook something pretty trivial...
Can you help me?