Problems with my VGG implementation

My own VGG implementation runs into serious problems during training:

  1. It does not converge; the loss stays at about 2.3 the whole time (roughly ln 10, i.e. no better than random guessing over 10 classes).
  2. GPU memory usage is extremely high and exceeds the 12 GB of my RTX 4070, even though in theory it should not need more than about 7 GB (see the diagnostic sketch after the code).
  3. Training is extremely slow; my guess is that moving the data to CUDA is the bottleneck (the sketch below also times this).

Could anyone help me figure out where I went wrong?

from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch

# deliberately not using the lazy API (nn.LazyConv2d / nn.LazyLinear)

class VGG(nn.Module):
    def __init__(self, in_channels, num_classes=10):
        super(VGG, self).__init__()
        # VGG-11-style stack: 8 conv layers followed by 3 fully connected layers
        self.net = nn.Sequential(
            nn.Conv2d(in_channels, 64, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(),
            nn.Conv2d(256, 256, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(512*7*7, 4096), nn.ReLU(),  # 7 = 224 / 2**5 after five max-pools
            nn.Dropout(0.5),
            nn.Linear(4096, 4096), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        x = self.net(x)
        return x

# use xavier initialization
def init_weights(m):
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(m.weight)


def train(model: nn.Module, criterion, optimizer, train_loader, epochs=1):
    model.train()
    for epoch in range(epochs):
        for i, (X, y) in enumerate(train_loader):
            optimizer.zero_grad()
            if use_gpu:
                X, y = X.cuda(), y.cuda()
            y_hat = model(X)
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                print(f"epoch {epoch}, batch {i}, loss {loss.item()}")

def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X, y in test_loader:
            if use_gpu:
                X, y = X.cuda(), y.cuda()
            y_hat = model(X)
            _, predicted = torch.max(y_hat, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
    print(f"accuracy: {correct/total}")


use_gpu = True

train_data = datasets.FashionMNIST(root="Beginners/data", train=True, download=True, transform=transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()]))
test_data = datasets.FashionMNIST(root="Beginners/data", train=False, download=True, transform=transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()]))
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = DataLoader(test_data, batch_size=128, shuffle=False)

model = VGG(in_channels=1)
model.apply(init_weights)
if use_gpu:
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

train(model, criterion, optimizer, train_loader, epochs=10)
test(model, test_loader)
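
For what it's worth, below is a small diagnostic sketch I'm thinking of running in place of the full train(...) call, to put concrete numbers on problems 2 and 3 (parameter memory, peak GPU memory for one training step, and where the per-step time goes). It only uses standard PyTorch calls and the names already defined above; the exact figures will of course depend on things like cuDNN workspace sizes and the data pipeline.

# --- diagnostic sketch: memory and timing for a single training step ---
import time

# parameter memory: this network has roughly 129M parameters (~0.5 GB as float32)
num_params = sum(p.numel() for p in model.parameters())
print(f"parameters: {num_params/1e6:.1f}M (~{num_params*4/1e9:.2f} GB as float32)")

if use_gpu:
    torch.cuda.reset_peak_memory_stats()

# time the host-to-device copy for one batch
X, y = next(iter(train_loader))
t0 = time.time()
if use_gpu:
    X, y = X.cuda(), y.cuda()
    torch.cuda.synchronize()
t_copy = time.time() - t0

# time one forward/backward/step and record peak GPU memory
t0 = time.time()
optimizer.zero_grad()
loss = criterion(model(X), y)
loss.backward()
optimizer.step()
if use_gpu:
    torch.cuda.synchronize()
    print(f"peak GPU memory: {torch.cuda.max_memory_allocated()/1e9:.2f} GB")
t_step = time.time() - t0
print(f"copy to GPU: {t_copy:.3f}s, forward/backward/step: {t_step:.3f}s")

# how long does the DataLoader alone take per batch (Resize to 224x224 runs on the CPU)?
t0 = time.time()
for i, _ in enumerate(train_loader):
    if i == 9:
        break
print(f"data loading: {(time.time() - t0) / 10:.3f}s per batch")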