I am very new to Machine Learning and I would like to get a percentage returned for an individual array that I pass in the prediction model I have created.
I'm not sure how to go about getting the match percentage. I thought it was metrics.accuracy_score(Ytest, y_pred)
but when I try that it gives me the following error:
**ValueError: Found input variables with inconsistent numbers of samples: [4, 1]**
I have no idea if this is the correct way to go about this.
import numpy as np #linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #For Visualisation
import seaborn as sns #For better Visualisation
from bs4 import BeautifulSoup #For Text Parsing
import mysql.connector
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
import docx2txt
import re
import csv
from sklearn import metrics
class Machine:
TrainData = ''
def __init__(self):
self.TrainData = self.GetTrain()
Data = self.ProcessData()
x = Data[0]
y = Data[1]
x, x_test, y, y_test = train_test_split(x,y, stratify = y, test_size = 0.25, random_state = 42)
self.Predict(x,y, '',x_test , y_test )
def Predict(self,X,Y,Data, Xtext, Ytest):
model = GaussianNB()
model.fit(Xtext, Ytest)
y_pred = model.predict([[1.0, 2.00613, 2, 5]])
print("Accuracy:",metrics.accuracy_score(Ytest, y_pred))
def ProcessData(self):
X = []
Y = []
i = 0
for I in self.TrainData:
Y.append(I[4])
X.append(I)
i = i 1
i = 0
for j in X:
X[i][0] = float(X[i][0])
X[i][1] = float(X[i][1])
X[i][2] = int(X[i][2])
X[i][3] = int(X[i][3])
del X[i][4]
i = i 1
return X,Y
def GetTrain(self):
file = open('docs/training/TI_Training.csv')
csvreader = csv.reader(file)
header = []
header = next(csvreader)
rows = []
for row in csvreader:
rows.append(row)
file.close()
return rows
Machine()
CodePudding user response:
The error is pretty clear: YTest
has 4 samples, and y_pred
only has one. You need an equal number of samples in each to get any metrics. I suspect you instead want to do
y_pred = model.predict(Xtext)
in your Predict
function.