import numpy as np
import torch
import torch.nn as nn

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.bn1 = torch.nn.BatchNorm2d(num_features=3)
        self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.act1 = torch.nn.ReLU()
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr1 = torch.nn.Dropout2d(0.1)
        self.bn2 = torch.nn.BatchNorm2d(num_features=16)
        self.conv2 = torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.act2 = torch.nn.ReLU()
        self.pool2 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr2 = torch.nn.Dropout2d(0.1)
        self.bn3 = torch.nn.BatchNorm2d(num_features=32)
        self.conv3 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.act3 = torch.nn.ReLU()
        self.pool3 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        # self.dr3 = torch.nn.Dropout2d(0.1)
        self.bn4 = torch.nn.BatchNorm1d(num_features=4 * 4 * 64)
        self.fc4 = torch.nn.Linear(4 * 4 * 64, 256)
        self.act4 = torch.nn.Tanh()
        # self.dr4 = torch.nn.Dropout1d(0.1)
        self.bn5 = torch.nn.BatchNorm1d(num_features=256)
        self.fc5 = torch.nn.Linear(256, 64)
        self.act5 = torch.nn.Tanh()
        # self.dr5 = torch.nn.Dropout1d(0.1)
        self.fc6 = torch.nn.Linear(64, 10)

    def forward(self, x):
        x = self.bn1(x)
        x = self.conv1(x)
        x = self.act1(x)
        x = self.pool1(x)
        # x = self.dr1(x)
        x = self.bn2(x)
        x = self.conv2(x)
        x = self.act2(x)
        x = self.pool2(x)
        # x = self.dr2(x)
        x = self.bn3(x)
        x = self.conv3(x)
        x = self.act3(x)
        x = self.pool3(x)
        # x = self.dr3(x)
        x = x.view(x.size(0), x.size(1) * x.size(2) * x.size(3))
        x = self.bn4(x)
        x = self.fc4(x)
        x = self.act4(x)
        # x = self.dr4(x)
        x = self.bn5(x)
        x = self.fc5(x)
        x = self.act5(x)
        # x = self.dr5(x)
        x = self.fc6(x)
        return x

last_model = Net()
#%%
def conv_block(in_f, out_f, activation='relu', *args, **kwargs):
    activations = nn.ModuleDict([
        ['tanh', nn.Tanh()],
        ['relu', nn.ReLU()]
    ])
    return nn.Sequential(
        nn.BatchNorm2d(in_f),
        nn.Conv2d(in_f, out_f, *args, **kwargs),
        activations[activation],
        nn.MaxPool2d(kernel_size=2, stride=2),
        # nn.Dropout2d(0.1)
    )

class MyEncoder(nn.Module):
    def __init__(self, enc_sizes, *args, **kwargs):
        super().__init__()
        self.conv_blocks = nn.Sequential(*[conv_block(in_f, out_f, kernel_size=3, padding=1, *args, **kwargs)
                                           for in_f, out_f in zip(enc_sizes, enc_sizes[1:])])

    def forward(self, x):
        return self.conv_blocks(x)

def dec_block(in_f, out_f):
    return nn.Sequential(
        nn.BatchNorm1d(in_f),
        nn.Linear(in_f, out_f),
        nn.Tanh(),
        # nn.Dropout1d(0.1)
    )

class MyDecoder(nn.Module):
    def __init__(self, dec_sizes, n_classes):
        super().__init__()
        self.dec_blocks = nn.Sequential(*[dec_block(in_f, out_f)
                                          for in_f, out_f in zip(dec_sizes, dec_sizes[1:])])
        self.last = nn.Linear(dec_sizes[-1], n_classes)

    def forward(self, x):
        return self.dec_blocks(x)

class MyNET(nn.Module):
    def __init__(self, in_c, enc_sizes, dec_sizes, n_classes, activation='relu'):
        super().__init__()
        self.enc_sizes = [in_c, *enc_sizes]
        l = 32 / (2 ** len(enc_sizes))  # spatial size left after the poolings (CIFAR10 images are 32x32)
        # print(enc_sizes[-1] * l * l)
        self.dec_sizes = [int(enc_sizes[-1] * l * l), *dec_sizes]
        self.encoder = MyEncoder(self.enc_sizes, activation=activation)
        self.decoder = MyDecoder(self.dec_sizes, n_classes)

    def forward(self, x):
        x = self.encoder(x)
        x = x.view(x.size(0), x.size(1) * x.size(2) * x.size(3))
        x = self.decoder(x)
        return x

my_model = MyNET(3, [16, 32, 64], [256, 64], 10, activation='relu')
Results after 5 epochs on CIFAR10 (test accuracy per epoch, one block per model):
tensor(0.6721)
tensor(0.7059)
tensor(0.7359)
tensor(0.7288)
tensor(0.7373)
---------------
tensor(0.4944)
tensor(0.5391)
tensor(0.5898)
tensor(0.6283)
tensor(0.6398)
The train function:
def train(net, X_train, y_train, X_test, y_test):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    net = net.to(device)
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1.0e-3, weight_decay=1e-5)
    batch_size = 100
    test_accuracy_history = []
    test_loss_history = []
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    for epoch in range(5):
        order = np.random.permutation(len(X_train))
        for start_index in range(0, len(X_train), batch_size):
            optimizer.zero_grad()
            net.train()
            batch_indexes = order[start_index:start_index + batch_size]
            X_batch = X_train[batch_indexes].to(device)
            y_batch = y_train[batch_indexes].to(device).view(-1)
            preds = net.forward(X_batch)
            loss_value = loss(preds, y_batch)
            loss_value.backward()
            optimizer.step()
        net.eval()
        test_preds = net.forward(X_test)
        test_loss_history.append(loss(test_preds, y_test.squeeze()).data.cpu())
        accuracy = (test_preds.argmax(dim=1) == y_test).float().mean().data.cpu()
        test_accuracy_history.append(accuracy)
        print(accuracy)
    print('---------------')
    return test_accuracy_history, test_loss_history
I was hoping that these are identical networks and that they would produce the same results. At first I thought the problem was in the training itself, but if I train the second model first and then the first one, the results stay the same. In the code I deliberately disabled the dropout so that it cannot randomly switch off neurons (and the random seed is the same in both runs). Could the problem be that the gradients are somehow computed differently when the layers are wrapped in blocks (nn.Sequential) than when they are written out layer by layer?
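(An illustrative sanity check, not part of the original post: push the same dummy batch through both instances created above and compare the output and parameter shapes. If the architectures were truly interchangeable, both the shapes of the outputs and the per-layer parameter shapes would match.)

x = torch.randn(8, 3, 32, 32)  # a fake CIFAR10-sized batch
print(last_model(x).shape)     # expected: torch.Size([8, 10]) for 10 classes
print(my_model(x).shape)       # if this differs, the two networks are not equivalent
for (n1, p1), (n2, p2) in zip(last_model.named_parameters(), my_model.named_parameters()):
    print(n1, tuple(p1.shape), '|', n2, tuple(p2.shape))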
Your forward method in your MyDecoder module skips the final linear layer. As written, the first model produces an output of size (bs, 10) while the second model produces an output of size (bs, 64).
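A minimal fix, keeping the rest of the posted MyDecoder unchanged, is to apply self.last in forward:

class MyDecoder(nn.Module):
    def __init__(self, dec_sizes, n_classes):
        super().__init__()
        self.dec_blocks = nn.Sequential(*[dec_block(in_f, out_f)
                                          for in_f, out_f in zip(dec_sizes, dec_sizes[1:])])
        self.last = nn.Linear(dec_sizes[-1], n_classes)

    def forward(self, x):
        x = self.dec_blocks(x)
        return self.last(x)  # the final dec_sizes[-1] -> n_classes projection that was being skipped

With that change both networks end in a 64 -> 10 linear layer and produce (bs, 10) logits.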