Why does knn always predict the same number? How can I solve this? The dataset is here.
Code:
import numpy as np
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import scipy.io
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
def load_mat_data(path):
    mat = scipy.io.loadmat(path)
    x, y = mat['data'], mat['class']
    x = x.astype('float32')
    # standardize values
    standardizer = preprocessing.StandardScaler()
    x = standardizer.fit_transform(x)
    return x, standardizer, y

def numpyToTensor(x):
    x_train = torch.from_numpy(x)
    return x_train

class DataBuilder(Dataset):
    def __init__(self, path):
        self.x, self.standardizer, self.y = load_mat_data(path)
        self.x = numpyToTensor(self.x)
        self.len = self.x.shape[0]
        self.y = numpyToTensor(self.y)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return self.len
datasets = ['/home/katerina/Desktop/datasets/GSE75110.mat']

for DATA_PATH in datasets:
    print(DATA_PATH)
    data_set = DataBuilder(DATA_PATH)
    pred_rpknn = [0] * len(data_set.y)
    kf = KFold(n_splits=10, shuffle=True, random_state=7)
    for train_index, test_index in kf.split(data_set.x):
        # Create KNN classifier
        knn = KNeighborsClassifier(n_neighbors=5)
        #print("TRAIN:", train_index, "TEST:", test_index)
        x_train, x_test = data_set.x[train_index], data_set.x[test_index]
        y_train, y_test = data_set.y[train_index], data_set.y[test_index]
        # Train the model using the training folds
        y1_train = y_train.ravel()
        knn.fit(x_train, y1_train)
        # Predict the response for the test fold
        y_pred = knn.predict(x_test)
        #print(y_pred)
        # Model accuracy: how often is the classifier correct?
        print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
        # Collect the out-of-fold predictions
        c = 0
        for idx in test_index:
            pred_rpknn[idx] = y_pred[c]
            c += 1
    print("Accuracy:", metrics.accuracy_score(data_set.y, pred_rpknn))
    print(pred_rpknn, data_set.y.reshape(1,-1))
Output:
/home/katerina/Desktop/datasets/GSE75110.mat
Accuracy: 0.2857142857142857
Accuracy: 0.38095238095238093
Accuracy: 0.14285714285714285
Accuracy: 0.4
Accuracy: 0.3
Accuracy: 0.25
Accuracy: 0.3
Accuracy: 0.6
Accuracy: 0.25
Accuracy: 0.45
Accuracy: 0.33497536945812806
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
I am trying to combine KNN with k-fold cross-validation in order to test the whole dataset using 10 folds. The problem is that KNN always predicts an array of 3s for every fold. The classes I want to predict are these:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]]
CodePudding user response:
TL;DR
It has to do with the StandardScaler; change it to a simple normalization.
e.g.
from sklearn import preprocessing
...
x = preprocessing.normalize(x)
Explanation:
StandardScaler, as you use it, will do:
The standard score of a sample `x` is calculated as: z = (x - u) / s where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if `with_std=False`.
whereas you actually want these features to help KNN decide which vectors are close to each other.
With normalize, the normalization happens for each sample (vector) separately, so it does not distort the relationship between a sample's features and can even help KNN differentiate between the vectors.
With KNN, StandardScaler can actually harm your predictions; it is better suited to other kinds of data.
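To see the difference concretely, here is a minimal sketch with made-up numbers (not your GSE75110 data): StandardScaler rescales each column (feature) across samples, while preprocessing.normalize rescales each row (sample) to unit length, so each sample keeps its own internal proportions.

import numpy as np
from sklearn.preprocessing import StandardScaler, normalize

x = np.array([[1.0, 4.0],
              [2.0, 2.0],
              [3.0, 6.0]])

# StandardScaler: each *column* becomes zero-mean, unit-variance
print(StandardScaler().fit_transform(x))
# approx. [[-1.22  0.  ]
#          [ 0.   -1.22]
#          [ 1.22  1.22]]

# normalize: each *row* is scaled to unit L2 norm
print(normalize(x))
# approx. [[0.24 0.97]
#          [0.71 0.71]
#          [0.45 0.89]]

Here is your code with normalize applied: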
import scipy.io
from torch.utils.data import Dataset
from sklearn import preprocessing
import torch
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
def load_mat_data(path):
    mat = scipy.io.loadmat(path)
    x, y = mat['data'], mat['class']
    x = x.astype('float32')
    # normalize each sample (row) instead of standardizing each feature
    x = preprocessing.normalize(x)
    return x, y

def numpyToTensor(x):
    x_train = torch.from_numpy(x)
    return x_train

class DataBuilder(Dataset):
    def __init__(self, path):
        self.x, self.y = load_mat_data(path)
        self.x = numpyToTensor(self.x)
        self.len = self.x.shape[0]
        self.y = numpyToTensor(self.y)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return self.len

datasets = ['/home/katerina/Desktop/datasets/GSE75110.mat']

for DATA_PATH in datasets:
    print(DATA_PATH)
    data_set = DataBuilder(DATA_PATH)
    pred_rpknn = [0] * len(data_set.y)
    kf = KFold(n_splits=10, shuffle=True, random_state=7)
    for train_index, test_index in kf.split(data_set.x):
        # Create KNN classifier
        knn = KNeighborsClassifier(n_neighbors=5)
        #print("TRAIN:", train_index, "TEST:", test_index)
        x_train, x_test = data_set.x[train_index], data_set.x[test_index]
        y_train, y_test = data_set.y[train_index], data_set.y[test_index]
        # Train the model using the training folds
        y1_train = y_train.view(-1)
        knn.fit(x_train, y1_train)
        # Predict the response for the test fold
        y_pred = knn.predict(x_test)
        #print(y_pred)
        # Model accuracy: how often is the classifier correct?
        print("Accuracy in loop:", metrics.accuracy_score(y_test, y_pred))
        # Collect the out-of-fold predictions
        c = 0
        for idx in test_index:
            pred_rpknn[idx] = y_pred[c]
            c += 1
    print("Accuracy:", metrics.accuracy_score(data_set.y, pred_rpknn))
    print(pred_rpknn, data_set.y.reshape(1,-1))
Accuracy in loop: 1.0
Accuracy in loop: 0.8571428571428571
Accuracy in loop: 0.8571428571428571
Accuracy in loop: 1.0
Accuracy in loop: 0.9
Accuracy in loop: 0.9
Accuracy in loop: 0.95
Accuracy in loop: 1.0
Accuracy in loop: 0.9
Accuracy in loop: 1.0
Accuracy: 0.9359605911330049