I have a code, with it, I wanted to train a neural network and save the finished model as a file. But I am getting an error due to incorrect distribution of training and training data. Can't understand why: `import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
class ChatBot(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, output_size):
super().__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x, hidden):
out, hidden = self.lstm(x, hidden)
out = self.fc(out[:, -1, :])
return out, hidden
def init_hidden(self, batch_size):
weight = next(self.parameters()).data
hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
return hidden
class ChatDataset(torch.utils.data.Dataset):
def __init__(self, data):
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, index):
return self.data[index]
def train(model, train_loader, loss_fn, optimizer, device):
model.train()
for inputs, targets in train_loader:
inputs = inputs.to(device)
targets = targets.to(device)
hidden = model.init_hidden(inputs.size(0))
hidden = tuple([each.data for each in hidden])
optimizer.zero_grad()
outputs, _ = model(inputs, hidden)
loss = loss_fn(outputs.view(-1), targets.view(-1))
loss.backward()
optimizer.step()
def evaluate(model, val_loader, loss_fn, device):
model.eval()
total_loss = 0
with torch.no_grad():
for inputs, targets in val_loader:
inputs = inputs.to(device)
targets = targets.to(device)
hidden = model.init_hidden(inputs.size(0))
hidden = tuple([each.data for each in hidden])
outputs, _ = model(inputs, hidden)
total_loss = loss_fn(outputs, targets).item()
return total_loss / len(val_loader)
device = torch.device("cuda" if
torch.cuda.is_available() else "cpu")
input_size = 500
hidden_size = 128
num_layers = 2
output_size = 500
model = ChatBot(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
data = [("Hi, how are you?", "I'm doing well, thank you for asking."),
("What's your name?", "I'm a chatbot, I don't have a name."),
("What's the weather like?", "I'm not sure, I don't have access to current weather information."),
("What's the time?", "I'm not sure, I don't have access to the current time.")]
dataset = ChatDataset(data)
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), int(0.2 * len(dataset))])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 100
for epoch in range(num_epochs):
train(model, train_loader, loss_fn, optimizer, device)
val_loss = evaluate(model, val_loader, loss_fn, device)
print("Epoch [{}/{}], Validation Loss: {:.4f}".format(epoch 1, num_epochs, val_loss))
torch.save(model.state_dict(), 'chatbot_model.pt')`
But, when I start this code, I have an error:
` ValueError
Traceback (most recent call last)
<ipython-input-8-ae2a6dd1bc7c> in
<module>
78 dataset = ChatDataset(data)
79
---> 80 train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), int(0.2 * len(dataset))])
81
82 train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataset.py in random_split(dataset, lengths, generator)
345 # Cannot verify that dataset is Sized
346 if sum(lengths) != len(dataset): # type: ignore[arg-type]
--> 347 raise ValueError("Sum of input lengths does not equal the length of the input dataset!")
348
349 indices = randperm(sum(lengths), generator=generator).tolist() # type: ignore[call-overload]
ValueError: Sum of input lengths does not equal the length of the input dataset!`
I don't know, why this error. Everything seems to be correct.
CodePudding user response:
I suspect there could be a loss of precision in this calculation,
[int(0.8 * len(dataset)), int(0.2 * len(dataset))]
so the number of records in the dataset is not fully accounted for.
for example:
int(.8 * 56) int(.2 * 56) = 55
CodePudding user response:
The typecasting of the values to an integer is causing a difference in the total number of images in the dataset and the distribution of the number of images in train and test.
Not the most ideal code, but replacing it with the following will work :
num_test_images = int(0.8 * len(dataset))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [num_test_images, len(dataset) - num_test_images])