I am using the AG News dataset to train a model for text classification.
This is the part that uses TabularDataset
to generate the datasets from the CSV
files.
import torchtext
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator
import spacy
def des_tokenize(x):
    """Split a description string into whitespace-delimited tokens.

    Args:
        x: Raw description text.

    Returns:
        A list of non-empty token strings. An empty or all-whitespace
        input yields an empty list.
    """
    # str.split() without a separator collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no '' tokens are produced
    # (x.split(' ') emitted one for every doubled, leading or trailing space).
    return x.split()
def title_tokenize(x):
    """Split a title string into whitespace-delimited tokens.

    Args:
        x: Raw title text.

    Returns:
        A list of non-empty token strings. An empty or all-whitespace
        input yields an empty list.
    """
    # str.split() without a separator never yields '' tokens, unlike
    # x.split(' '), which emitted one for each leading, trailing or
    # repeated space in the title.
    return x.split()
def category_tokenize(x):
    """Return the category value exactly as it was passed in.

    Args:
        x: Raw category value read from the data.

    Returns:
        The same object ``x``; no splitting or conversion is applied.
    """
    label = x
    return label
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One Field per CSV column that is kept.
# NOTE(review): category_tokenize returns the raw string and this Field keeps
# its default settings - confirm that is the intended handling for labels.
CATEGORY = Field(tokenize=category_tokenize)

# Title and description use the same start/end-of-sequence markers.
_boundary_tokens = {'init_token': '<SOS>', 'eos_token': '<EOS>'}
TITLE = Field(tokenize=title_tokenize, **_boundary_tokens)
DES = Field(tokenize=des_tokenize, **_boundary_tokens)

spacy_en = spacy.load('en_core_web_sm')

# (column name, Field) pairs in CSV column order; the 'id' column is paired
# with None instead of a Field.
train_fields = [
    ('id', None),
    ('category', CATEGORY),
    ('title', TITLE),
    ('description', DES),
]
# NOTE(review): the test file is declared with only title/description columns;
# confirm test.csv really has that layout (train.csv is declared with leading
# id and category columns).
test_fields = [
    ('title', TITLE),
    ('description', DES),
]

# Both files are CSVs whose first row is a header that must be skipped.
_csv_options = {'format': 'csv', 'skip_header': True}

train_data = TabularDataset(
    path='/content/drive/MyDrive/summer2/train.csv',
    fields=train_fields,
    **_csv_options,
)
test_data = TabularDataset(
    path='/content/drive/MyDrive/summer2/test.csv',
    fields=test_fields,
    **_csv_options,
)
After the datasets were generated, I chose to use the pre-trained embeddings from
torchtext.vocab.GloVe
to build the vocabulary
.
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# Batch sizes for the two iterators built below.
train_batch_size = 10
test_batch_size = 1
# max_length and tokenizer are not used in the code visible here; they are
# kept in case later code relies on them.
max_length = 256
tokenizer = get_tokenizer('basic_english')

train_iter = torchtext.legacy.data.BucketIterator(
    train_data,
    batch_size=train_batch_size,
)
# The iterator defaults to train=True, which shuffles the examples. For the
# test set the original row order must be kept so predictions can be matched
# back to their rows. sort=False is needed because train=False would otherwise
# turn sorting on, and no sort_key is defined for this dataset.
test_iter = torchtext.legacy.data.BucketIterator(
    test_data,
    batch_size=test_batch_size,
    train=False,
    sort=False,
    shuffle=False,
)

# Load the GloVe vectors once and share them between both text fields instead
# of constructing (and loading) the same 50k-vector table twice.
glove_vectors = torchtext.vocab.GloVe(name="6B", dim=50, max_vectors=50_000)

DES.build_vocab(
    train_data,
    vectors=glove_vectors,
    max_size=50_000,
)
TITLE.build_vocab(
    train_data,
    vectors=glove_vectors,
    max_size=50_000,
)
CATEGORY.build_vocab(train_data)
The output looks good after calling the create_batches
function:
def create_batches(self):
    """Rebuild ``self.batches`` from the iterator's current data.

    Passes ``self.data()``, ``self.batch_size`` and ``self.batch_size_fn``
    to ``batch`` and stores the result on the instance. Nothing is returned.
    """
    # NOTE(review): `batch` is not defined in this snippet - presumably the
    # batching helper of the library this method was quoted from; confirm.
    examples = self.data()
    self.batches = batch(examples, self.batch_size, self.batch_size_fn)
# The batches have to be regenerated before every pass over the iterator.
train_iter.create_batches()

# The header and row layout do not change between iterations, so build them
# once up front.
header_line = 'category\ttitle\tdescription'.ljust(10)
row_format = '%s \t %s \t %s'.ljust(10)

# Walk the BucketIterator's batches.
print('PyTorchText BuketIterator\n')
for batch in train_iter.batches:
    # Report how many examples ended up in this batch.
    print('Batch size: %d\n' % len(batch))
    print(header_line)

    # Show category, title and description of every example in the batch.
    for example in batch:
        row_values = (example.category, example.title, example.description)
        print(row_format % row_values)
    print('\n')

    # The first batch is enough for inspection. Reuse this loop (without the
    # break) when training models.
    break
The output looks like this:
PyTorchText BuketIterator
Batch size: 10
category title description
2 ['UPDATE', '1-Open-Rejuvenated', 'Haas', 'reaches', 'last', 'eight'] ['Germany', '#39;s', 'Tommy', 'Haas', 'continued', 'his', 'resurgence', 'with', 'a', '7-6', '6-1', '7-5', 'victory', 'over', 'Czech', 'teenager', 'Tomas', 'Berdych', 'on', 'Tuesday', 'to', 'reach', 'the', 'quarter-finals', 'of', 'the', 'US', 'Open', 'for', 'the', 'first', 'time.']
3 ['Japan', '#39;s', 'Nikkei', 'Average,', 'Topix', 'Advance;', 'Toyota,', 'Advantest', 'Gain'] ['Japan', '#39;s', 'Nikkei', '225', 'Stock', 'Average', 'rose', '56.74,', 'or', '0.5', 'percent,', 'to', '11,139.97', 'at', '9:01', 'am', 'in', 'Tokyo.', 'The', 'broader', 'Topix', 'index', 'gained', '5.35,', 'or', '0.5', 'percent,', 'to', '1132.']
2 ['Wildcats', 'on', 'the', 'rise', 'with', 'Santos'] ['The', 'University', 'of', 'New', "Hampshire's", 'impressive', '51-40', 'road', 'victory', 'over', '10th-ranked', 'Villanova', 'Saturday', 'night', 'vaulted', 'the', 'Wildcats', 'three', 'spots', 'to', 'ninth', 'in', 'this', "week's", 'Sports', 'Network', '1-AA', 'football', 'poll,', 'while', 'dropping', 'Villanova', 'to', '14th.']
1 ['Cracking', 'under', 'the', 'strain'] ['Severe', 'cracks', 'surfaced', 'inside', 'the', 'Israeli', 'government', 'this', 'week', 'as', 'its', 'senior', 'law', 'officers', 'publicly', 'fell', 'out', 'with', 'the', 'defence', 'establishment', 'and', 'the', 'Foreign', 'Ministry', 'over', 'the', 'country', '#39;s', 'future', 'strategy', 'in', 'the', 'face', 'of', 'the', 'July', 'verdict', 'of', 'the', 'International', '']
1 ['Arab', 'League', 'to', 'hold', 'emergency', 'meeting'] ['The', 'Arab', 'League', 'says', 'it', 'will', 'hold', 'an', 'emergency', 'session', 'to', 'discuss', 'the', 'violence', 'in', 'Gaza,', 'which', 'has', 'claimed', 'at', 'least', '56', 'Palestinians', 'this', 'week.']
2 ['Holmes', 'to', 'decide', 'on', 'double'] ['Kelly', 'Holmes', 'has', 'still', 'to', 'confirm', 'whether', 'she', 'will', 'attempt', 'to', 'repeat', 'her', 'Olympic', 'double', 'at', 'this', 'weekend', '#39;s', 'World', 'Athletics', 'Final', 'after', 'clearing', 'the', 'first', 'hurdle', 'with', 'a', 'victory', 'in', 'the', '1500m', 'yesterday.']
2 ['NBA', 'suspends', 'nine', 'players,', 'Artest', 'for', 'rest', 'of', 'season'] ['NBA', 'on', 'Sunday', 'suspended', 'nine', 'players', 'for', 'involving', 'in', 'a', 'melee', 'during', 'Friday', '#39;s', 'game', 'between', 'Detorit', 'Pistons', 'and', 'Indiana', 'Pacers,', 'with', 'Ron', 'Artest', 'suspended', 'for', 'the', 'rest', 'of', 'the', 'season,', '73', 'games.']
2 ['On', 'the', 'Far', 'Side', 'of', 'the', 'Field,', 'a', 'Familiar', 'Face'] ['Perhaps', 'there', 'will', 'be', 'a', 'moment', 'during', "Sunday's", 'game', 'between', 'the', 'Giants', 'and', 'the', 'Redskins', 'when', 'a', 'coach', 'and', 'his', 'former', 'franchise', 'quarterback', 'will', 'do', 'a', 'double', 'take.']
3 ['', '#39;QUIET', '#39;', 'RULE', 'MAY', 'CHANGE'] ['The', 'Securities', 'and', 'Exchange', 'Commission', 'wants', 'to', 'scrap', 'a', '1933', 'rule', 'that', 'forces', 'a', 'strict', '', 'quot;quiet', 'period', 'quot;', 'on', 'all', 'talk', 'about', 'a', 'company', 'just', 'prior', 'to', 'its', 'stock', 'being', 'sold', 'initially', 'to', 'the', 'public.']
2 ['Denehy', 'boosts', 'Walpole', ''] ['Danvers', 'coach', 'thought', 'he', 'had', 'the', 'perfect', 'game', 'plan', 'against', 'Walpole', 'last', 'night', 'in', 'the', 'Division', '2', 'playoffs', 'at', 'Endicott', 'College.', 'It', 'was', 'the', 'same', 'game', 'plan', 'that', 'earned', 'his', 'team', 'its', 'first', 'playoff', 'berth', 'in', '63', 'years.']
My question is: what happens if I use build_vocab_from_iterator
to create the iterator?
Does that function serve the same purpose as the part of my code that uses BucketIterator
?
Also, I think using the pre-trained GloVe word embeddings
is better than using FastText
for this task, because the model needs to classify which category each description belongs to.
CodePudding user response:
In the end, the solution I posted above is able to train the model.
It is also better to filter out stopwords using a library's stopword list to get better accuracy.