I'm trying to define a set of classes/labels for multilabel classification with classes 1,2,3 and 4, but are having issues with the array accidentally also includes the following:
multilabel.classes_ array([' ', ',', '1', '2', '3', '4'], dtype=object)
I only want to have 1,2,3,4 as my labels, and can't figure out a way to remove this.
My code:
import pandas as pd
import numpy as np
import os
import ast
import seaborn as sns #pip install seaborn
import matplotlib.pyplot as plt
import skmultilearn #pip install scikit-multilearn
from preprocessing.transcription_preprocessing import TranscriptionPreprocessor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
df = pd.read_csv(r'C:\Users\M94969\Desktop\datasets\prod500.csv')
# Define label variable
y = df['tags']
# Make multilabelbinarizer object
#multilabel = MultiLabelBinarizer()
#y = multilabel.fit_transform(y)
#multilabel.classes_
#pd.DataFrame(y,columns=multilabel.classes_)
labelbinarizer = LabelBinarizer()
fit = labelbinarizer.fit_transform(y)
labelbinarizer.classes_
pd.DataFrame(y,columns=labelbinarizer.classes_)
# Turn texts into sparse matrix
tfidf = TfidfVectorizer(analyzer='word', max_features=1000, max_df=0.50, ngram_range=(1,3))
X = tfidf.fit_transform(df['text'])
tfidf.vocabulary_
# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Build models
sgd = SGDClassifier()
lr = LogisticRegression(solver = 'lbfgs')
svc = LinearSVC()
def j_score(y_true, y_pred):
jaccard = np.minimum(y_true, y_pred).sum(axis=1)/np.maximum(y_true, y_pred).sum(axis = 1)
return jaccard.mean()*100
def print_score(y_pred, clf):
print("Clf: ", clf.__class__.__name__)
print('Jacard score: {}'.format(j_score(y_test,y_pred)))
print('----')
for classifier in [sgd, lr, svc]:
clf = OneVsRestClassifier(classifier)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print_score(y_pred, classifier)
link to subset of data: https://www.filemail.com/d/whkmmsazgrwzfdp
CodePudding user response:
When you run:
df['tags'].unique()
on your sample data the output is as follows:
array(['1', '3', '2', '1, 2'], dtype=object)
The multitag assignment happens in row 7 of your dataframe:
df[df['tags']=='1, 2']
results in:
text TV Internet Mobil Fastnet tags
7 TIL YOUSEE... 1 2 0 0 1, 2
If you don't want this binarized you could simply remove the row or assign a single label in your dataframe.
Alternatively you can look at the sklearn LabelBinarizer to get labels more in line of what you are looking for:
labelbinarizer = LabelBinarizer()
fit = labelbinarizer.fit_transform(y)
labelbinarizer.classes_
# array(['1', '1, 2', '2', '3'], dtype='<U4')