I am building a model for the CIFAR10 dataset.
My model starts with a high loss, but it only goes down to about 2.3. Given that there are 10 classes and the loss uses the natural logarithm, this means the model is only reaching 10% accuracy, i.e. chance level (ln(10) ≈ 2.3). I am switching from Keras/TensorFlow and am lost as to what I am doing wrong. Any help, advice, or resources would be appreciated.
Model
class Net(pl.LightningModule):
    """Deep CNN classifier: nine unpadded 3x3 conv layers and a five-layer MLP head.

    The size of the first linear layer (25088 = 512 * 7 * 7) fixes the
    expected input to 3x32x32 images: every unpadded 3x3 convolution shrinks
    the spatial size by 2 (32 -> 14 after nine layers) and the final 2x2
    max-pool halves it to 7. The network returns 10 raw logits per image.

    Args:
        lr: Learning rate handed to Adam. The previous hard-coded value of
            0.02 left training stuck at a loss of ~2.3 (= ln(10), chance
            level for 10 classes) with a constant prediction, so the default
            is now Adam's usual 1e-3.
    """

    def __init__(self, lr: float = 1e-3):
        super().__init__()
        self.lr = lr

        # Feature extractor: channel widths 3 -> 6 -> 12 -> 24 -> 128 -> 256
        # -> 256 -> 512 -> 512 -> 512, each conv followed by a ReLU.
        self.conv1 = nn.Sequential(nn.Conv2d(3, 6, 3), nn.ReLU())
        self.conv2 = nn.Sequential(nn.Conv2d(6, 12, 3), nn.ReLU())
        self.conv3 = nn.Sequential(nn.Conv2d(12, 24, 3), nn.ReLU())
        self.conv4 = nn.Sequential(nn.Conv2d(24, 128, 3), nn.ReLU())
        self.conv5 = nn.Sequential(nn.Conv2d(128, 256, 3), nn.ReLU())
        self.conv6 = nn.Sequential(nn.Conv2d(256, 256, 3), nn.ReLU())
        self.conv7 = nn.Sequential(nn.Conv2d(256, 512, 3), nn.ReLU())
        self.conv8 = nn.Sequential(nn.Conv2d(512, 512, 3), nn.ReLU())
        # Only the last stage down-samples (2x2 max-pool, stride 2).
        self.conv9 = nn.Sequential(
            nn.Conv2d(512, 512, 3), nn.ReLU(), nn.MaxPool2d(2, 2)
        )

        # Classifier head; 25088 = 512 channels * 7 * 7 for 32x32 inputs.
        self.fc1 = nn.Linear(25088, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 512)
        self.fc4 = nn.Linear(512, 84)
        self.last = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        x = self.conv8(x)
        x = self.conv9(x)
        # Keep the batch dimension, flatten everything else for the MLP head.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        # No activation here: F.cross_entropy expects unnormalized logits.
        x = self.last(x)
        return x

    def training_step(self, batch, batch_nb):
        """Return the cross-entropy loss of one ``(inputs, targets)`` batch."""
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

    def test_step(self, batch, batch_nb):
        """Return the cross-entropy loss of one ``(inputs, targets)`` test batch."""
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)
Then I simply do
# Train `net` for 15 epochs on a single GPU; no validation loader is passed.
trainer = pl.Trainer(gpus=1, max_epochs=15)
trainer.fit(model=net, train_dataloaders=trainloader)
Additionally, when printing out predictions, it always prints "truck".
My dataloading is near identical to the one on the PyTorch tutorial.
batch_size = 128

# Convert each image to a tensor, then normalize all three channels with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Training split: reshuffled by the loader, read by 4 worker processes.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, transform=transform, download=True
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=4
)

# Test split: kept in dataset order, read by 2 worker processes.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, transform=transform, download=True
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2
)

# Class names, in the order used to label predictions.
classes = (
    'plane',
    'car',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)
CodePudding user response:
I changed some of your code, and after 15 epochs I get a loss of 0.866.
For the CIFAR10 dataset, you do not need such a large network. (A large network needs more time to train, which may be why the loss of your network decreases so slowly.) Note that the code below trains with Adam at a learning rate of 1e-3.
# !pip install pytorch_lightning
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torchvision import transforms
import pytorch_lightning as pl
class Net(pl.LightningModule):
    """Compact CNN classifier producing 10 logits per image.

    Two 5x5 convolutions, each followed by ReLU and a shared 2x2 max-pool,
    feed a three-layer fully connected head. The head's input size
    (16 * 5 * 5) corresponds to 32x32 input images.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        # One pooling module is reused after both convolutions.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        features = x
        for conv in (self.conv1, self.conv2):
            features = self.pool(F.relu(conv(features)))
        # Flatten every dimension except the batch dimension.
        flat = torch.flatten(features, start_dim=1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        # The last layer stays linear: cross-entropy consumes raw logits.
        return self.fc3(hidden)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss of one ``(inputs, targets)`` batch."""
        inputs, targets = batch
        logits = self(inputs)
        return F.cross_entropy(logits, targets)

    def validation_step(self, batch, batch_idx):
        """Return the cross-entropy loss of one validation batch."""
        inputs, targets = batch
        logits = self(inputs)
        return F.cross_entropy(logits, targets)
# data
batch_size = 128

# Convert each image to a tensor, then normalize all three channels with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Training split: reshuffled by the loader.
trainset = CIFAR10(
    root='./data', train=True, transform=transform, download=True
)
trainloader = DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2
)

# Test split: kept in dataset order.
testset = CIFAR10(
    root='./data', train=False, transform=transform, download=True
)
testloader = DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2
)

# Class names, in the order used to label predictions.
classes = (
    'plane',
    'car',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)
# model
model = Net()

# Train for 15 epochs on a single GPU; the test loader doubles as the
# validation loader.
trainer = pl.Trainer(gpus=1, max_epochs=15)
trainer.fit(model, train_dataloaders=trainloader, val_dataloaders=testloader)
Output:
Files already downloaded and verified
Files already downloaded and verified
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
------------------------------------
0 | conv1 | Conv2d | 456
1 | pool | MaxPool2d | 0
2 | conv2 | Conv2d | 2.4 K
3 | fc1 | Linear | 48.1 K
4 | fc2 | Linear | 10.2 K
5 | fc3 | Linear | 850
------------------------------------
62.0 K Trainable params
0 Non-trainable params
62.0 K Total params
0.248 Total estimated model params size (MB)
cuda:0
Epoch 14: 100%...470/470 [00:14<00:00, 32.84it/s, loss=0.864, v_num=1]