PyTorch model trained with Lightning has loss stuck at a baseline


I am building a model for the CIFAR10 dataset.

My model starts with a high loss but only ever gets down to 2.3. Since there are 10 classes and cross-entropy is the negative natural log of the predicted probability, a loss of ln(10) ≈ 2.3 means the model is doing no better than random guessing (10% accuracy). I am switching over from Keras/TensorFlow and am lost as to what I am doing wrong. Any help/advice/resources would be appreciated.
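For reference, here is a quick sanity check (a standalone snippet, using nothing but torch itself) showing that uniform predictions over 10 classes give exactly this baseline loss:

import math
import torch
import torch.nn.functional as F

# Uniform logits over 10 classes: the model is effectively guessing at random.
logits = torch.zeros(1, 10)                    # equal score for every class
target = torch.tensor([3])                     # any label gives the same loss here
print(F.cross_entropy(logits, target).item())  # ~2.3026
print(math.log(10))                            # ln(10) = 2.302585...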

Model



import torch
from torch import nn
from torch.nn import functional as F
import pytorch_lightning as pl

class Net(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(nn.Conv2d(3, 6, 3), nn.ReLU())
        self.conv2 = nn.Sequential(nn.Conv2d(6, 12, 3), nn.ReLU())
        self.conv3 = nn.Sequential(nn.Conv2d(12, 24, 3), nn.ReLU())
        self.conv4 = nn.Sequential(nn.Conv2d(24, 128, 3), nn.ReLU())
        self.conv5 = nn.Sequential(nn.Conv2d(128, 256, 3), nn.ReLU())
        self.conv6 = nn.Sequential(nn.Conv2d(256, 256, 3), nn.ReLU())
        self.conv7 = nn.Sequential(nn.Conv2d(256, 512, 3), nn.ReLU())
        self.conv8 = nn.Sequential(nn.Conv2d(512, 512, 3), nn.ReLU())
        self.conv9 = nn.Sequential(nn.Conv2d(512, 512, 3), nn.ReLU(), nn.MaxPool2d(2, 2))
        self.fc1 = nn.Linear(25088, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 512)
        self.fc4 = nn.Linear(512, 84)
        self.last = nn.Linear(84, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        x = self.conv8(x)
        x = self.conv9(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.last(x)
        return x

    def training_step(self, batch, batch_nb):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

    def test_step(self, batch, batch_nb):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)
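As an aside, the 25088 input size of fc1 can be double-checked with a quick shape probe (a throwaway snippet, assuming the Net class above):

# Throwaway check: shape of the conv stack's output for one CIFAR10-sized image.
dummy = torch.randn(1, 3, 32, 32)
net = Net()
feats = net.conv9(net.conv8(net.conv7(net.conv6(net.conv5(
    net.conv4(net.conv3(net.conv2(net.conv1(dummy)))))))))
print(feats.shape)  # torch.Size([1, 512, 7, 7]) -> 512 * 7 * 7 = 25088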



Then I simply do

net = Net()
trainer = pl.Trainer(max_epochs=15, gpus=1)
trainer.fit(net, train_dataloaders=trainloader)

Additionally, when printing out predictions, it always prints "truck".
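One way to see the collapse directly is to count the predicted classes over a single test batch (a rough diagnostic, assuming the trained net from above and the testloader defined below):

# Rough diagnostic: how often each class is predicted on one test batch.
net.eval().to("cpu")
with torch.no_grad():
    images, labels = next(iter(testloader))
    preds = net(images).argmax(dim=1)
print(torch.bincount(preds, minlength=10))  # one dominant count => collapsed predictions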

My data loading is nearly identical to the one in the PyTorch CIFAR10 tutorial.

import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
batch_size = 128
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=4)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

CodePudding user response:

I changed some of your code, and after 15 epochs I get a loss of 0.866.

For the CIFAR10 dataset, you do not need such a large network. A network that large takes much longer to train, which may be why your loss decreases so slowly. Note that I also lowered the Adam learning rate from 0.02 to 1e-3; with a learning rate that high, training can stall at the random-guessing baseline.

# !pip install pytorch_lightning 

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torchvision import transforms
import pytorch_lightning as pl

class Net(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

    def validation_step(self, batch, batch_idx):
        # testloader is passed as the val dataloader below, so the test
        # set doubles as validation data in this example.
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

# data
transform = transforms.Compose([
    transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
batch_size = 128
trainset = CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

testset = CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# model
model = Net()

# training
trainer = pl.Trainer(max_epochs=15, gpus=1)
trainer.fit(model, trainloader, testloader)

Output:

Files already downloaded and verified
Files already downloaded and verified
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | conv1 | Conv2d    | 456   
1 | pool  | MaxPool2d | 0     
2 | conv2 | Conv2d    | 2.4 K 
3 | fc1   | Linear    | 48.1 K
4 | fc2   | Linear    | 10.2 K
5 | fc3   | Linear    | 850   
------------------------------------
62.0 K    Trainable params
0         Non-trainable params
62.0 K    Total params
0.248     Total estimated model params size (MB)
cuda:0
Epoch 14: 100%...470/470 [00:14<00:00, 32.84it/s, loss=0.864, v_num=1]
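Since the question is ultimately about accuracy, here is a minimal evaluation loop you could run after training (a sketch, not part of the run above, assuming model and testloader as defined earlier):

# Minimal sketch: overall accuracy of the trained model on the test set.
model.eval().to("cpu")
correct = total = 0
with torch.no_grad():
    for images, labels in testloader:
        preds = model(images).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"test accuracy: {correct / total:.3f}")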