I want to fine-tune a pre-trained DistilBERT model (a transformer model based on the BERT architecture) available on Hugging Face. I did some data cleaning/pre-processing to generate CSV data and uploaded it to an S3 bucket.
Based on the example provided here (https://github.com/aws-samples/finetune-deploy-bert-with-amazon-sagemaker-for-hugging-face), the code below is a train.py file.
I have a couple of CSV files that I want to use for training and testing. In the code below, it loads the data as shown here; how can I change this so it reads and uses the CSVs, given that they are in an S3 location?
train_dataset = load_from_disk(args.training_dir)
"""
Training script for Hugging Face SageMaker Estimator
"""
import logging
import sys
import argparse
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_from_disk
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_name", type=str)
    parser.add_argument("--tokenizer_name", type=str)
    parser.add_argument("--learning_rate", type=str, default=5e-5)

    # Data, model, and output directories
    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])

    args, _ = parser.parse_known_args()

    # load datasets
    train_dataset = load_from_disk(args.training_dir)
    test_dataset = load_from_disk(args.test_dir)

    # download model and tokenizer from the Hugging Face model hub
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

    # define training args
    training_args = TrainingArguments(
        output_dir=args.model_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        warmup_steps=args.warmup_steps,
        evaluation_strategy="epoch",
        logging_dir=f"{args.output_data_dir}/logs",
        learning_rate=float(args.learning_rate),
    )

    # create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    # train model
    trainer.train()
...
...
CodePudding user response:
You can pass a remote S3 URI to the load_from_disk function. The relevant argument is dataset_path, described in the docs as follows:

dataset_path (str) — Path (e.g. "dataset/train") or remote URI (e.g. "s3://my-bucket/dataset/train") of the dataset directory where the dataset will be loaded from.

Reference: https://huggingface.co/docs/datasets/v2.8.0/en/package_reference/main_classes#datasets.Dataset.load_from_disk
from datasets import load_from_disk

# load an encoded dataset from cloud storage
# (storage_options is defined further below)
dataset = load_from_disk("s3://a-public-datasets/imdb/train", storage_options=storage_options)
print(len(dataset))
# 25000
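Note that load_from_disk expects a dataset directory previously written with Dataset.save_to_disk (Arrow files plus metadata), not raw CSV files; for reading CSVs directly, see the sketch at the end of this answer.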
To pass the S3 session details, see the documentation here: https://huggingface.co/docs/datasets/filesystems#amazon-s3
storage_options = {"anon": True} # for anonymous connection
# or use your credentials
storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key} # for private buckets
# or use a botocore session
import botocore
s3_session = botocore.session.Session(profile="my_profile_name")
storage_options = {"session": s3_session}