I am trying to save images that I configure during training to the output bucket in sagemaker. I've read that all the information that needs to be saved during training goes into the model.tar.gz file. I've tried saving plots using the model_dir and the output_data_dir to no avail. The model itself is saved properly, but the additional information is not being stored with it. I want to reload this additional information (the saved images) during inference but have heard that storing all the information in the model.tar.gz can cause slow inference. I would love some help.
from sagemaker.pytorch import PyTorch
estimator = PyTorch(entry_point='XXXXXXXX/AWS/mnist.py',
role=role,
py_version='py3',
framework_version='1.8.0',
instance_count=1,
instance_type='ml.c5.xlarge',
output_path='s3://XXXXX-bucket/',
)
# mnist.py
import os
import pandas as pd
import torch
import matplotlib.pyplot as plt
import argparse
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torchvision.io import read_image
from torch import nn
import matplotlib.pyplot as plt
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10),
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
def train_loop(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
# Compute prediction and loss
pred = model(X.to(device))
loss = loss_fn(pred, y.to(device))
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn):
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
pred = model(X.to(device))
test_loss = loss_fn(pred, y.to(device)).item()
correct = (pred.argmax(1) == y.to(device)).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
# Initialize the loss function
if __name__=='__main__':
# default to the value in environment variable `SM_MODEL_DIR`. Using args makes the script more portable.
parser = argparse.ArgumentParser()
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
args, _ = parser.parse_known_args()
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor()
)
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor()
)
labels_map = {
0: "T-Shirt",
1: "Trouser",
2: "Pullover",
3: "Dress",
4: "Coat",
5: "Sandal",
6: "Shirt",
7: "Sneaker",
8: "Bag",
9: "Ankle Boot",
}
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows 1):
sample_idx = torch.randint(len(training_data), size=(1,)).item()
img, label = training_data[sample_idx]
figure.add_subplot(rows, cols, i)
plt.title(labels_map[label])
plt.axis("off")
plt.imsave(args.output_data_dir 'plot' str(i) '.jpg', img.squeeze(), cmap="gray")
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
# Display image and label.
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
img = train_features[0].squeeze()
label = train_labels[0]
plt.imsave(args.output_data_dir 'sample.jpg', img, cmap="gray")
print("Saved img.")
print(f"Label: {label}")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
model = NeuralNetwork().to(device)
print(model)
learning_rate = 1e-3
batch_size = 64
epochs = 5
# ... train `model`, then save it to `model_dir`
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
epochs = 1
for t in range(epochs):
print(f"Epoch {t 1}\n-------------------------------")
train_loop(train_dataloader, model, loss_fn, optimizer)
test_loop(test_dataloader, model, loss_fn)
print("Done!")
with open(os.path.join(args.model_dir, 'model.pth'), 'wb') as f:
torch.save(model.state_dict(), f)
plt.plot([1,2,3,4])
plt.ylabel('some numbers')
plt.show()
plt.savefig('test.jpeg')
CodePudding user response:
I suspect there is an issue with string concatenation in plt.imsave
because the environment variable SM_OUTPUT_DATA_DIR
by default points to /opt/ml/output/data
(that's the actual value of args.output_data_dir
, since you don't pass this parameter) so the outcome is something like /opt/ml/output/dataplot1.jpg
. The same happen if you use the model_dir
in the same way. I'd rather use something like os.path.join
like you're already doing for the model. here a nice exaplaination about these folders and environment variables in sagemaker.