VGG implementation problem

I am encountering significant issues with my own VGG implementation during training:

  • It fails to converge: the loss stays around 2.3 (roughly ln(10), i.e., chance level for 10 classes).
  • Memory usage is extremely high, exceeding the 12 GB of my RTX 4070. Theoretically it should not exceed 7 GB, which is what I got when running the d2l notebooks.
  • Training is very slow, which I suspect is due to the data being moved to CUDA. (A rough measurement sketch follows the full code below.)

Can anyone help me identify what might be going wrong?


from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch


class VGG(nn.Module):
    def __init__(self, in_channels, num_classes=10):
        super(VGG, self).__init__()
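        # VGG-11-style layer stack: five conv blocks with max pooling,
        # followed by the standard 4096-4096-num_classes classifier.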
        self.net = nn.Sequential(
            nn.Conv2d(in_channels, 64, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(),
            nn.Conv2d(256, 256, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(512*7*7, 4096), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        x = self.net(x)
        return x

# Use Xavier initialization for all conv and linear layers
def init_weights(m):
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(m.weight)


def train(model:nn.Module, criterion, optimizer, train_loader, epochs=1):
    model.train()
    for epoch in range(epochs):
        for i, (X, y) in enumerate(train_loader):
            optimizer.zero_grad()
            if use_gpu:
                X, y = X.cuda(), y.cuda()
            y_hat = model(X)
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                print(f"epoch {epoch}, batch {i}, loss {loss.item()}")

def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X, y in test_loader:
            if use_gpu:
                X, y = X.cuda(), y.cuda()
            y_hat = model(X)
            _, predicted = torch.max(y_hat, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
    print(f"accuracy: {correct/total}")


use_gpu = True

train_data = datasets.FashionMNIST(root="Beginners/data", train=True, download=True, transform=transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()]))
test_data = datasets.FashionMNIST(root="Beginners/data", train=False, download=True, transform=transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()]))
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = DataLoader(test_data, batch_size=128, shuffle=False)

model = VGG(in_channels=1)
model.apply(init_weights)
if use_gpu:
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

train(model, criterion, optimizer, train_loader, epochs=10)
test(model, test_loader)
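
In case it helps with diagnosis, here is a minimal sketch of how I would check peak GPU memory and per-batch copy/compute time, assuming the model, criterion, optimizer, and train_loader defined above. It only uses torch.cuda.max_memory_allocated and simple wall-clock timing, and is not part of the training script itself:

import time

# Rough diagnostic (not part of training): time the host-to-device copy and the
# forward/backward step separately, and report peak GPU memory, for a few batches.
torch.cuda.reset_peak_memory_stats()
model.train()
for i, (X, y) in enumerate(train_loader):
    t0 = time.time()
    X, y = X.cuda(), y.cuda()      # host-to-device copy
    torch.cuda.synchronize()
    t_copy = time.time() - t0

    t0 = time.time()
    optimizer.zero_grad()
    loss = criterion(model(X), y)
    loss.backward()
    optimizer.step()
    torch.cuda.synchronize()
    t_step = time.time() - t0

    peak_gib = torch.cuda.max_memory_allocated() / 1024**3
    print(f"batch {i}: copy {t_copy:.3f}s, step {t_step:.3f}s, peak mem {peak_gib:.2f} GiB")
    if i >= 5:
        break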