Home > OS >  k nearest neighbour using numpy
k nearest neighbour using numpy

Time:12-11

k nearest neighbours is a useful algorithm for classification, and I have some questions about it.

If I have a train set (1000, 400), a test set (300, 400), and train labels (1000,), how can I apply k nearest neighbours to find the right test labels, using only numpy? Thank you!

CodePudding user response:

Assuming you have a basic understanding of numpy, here is some old code of mine for a KNN classifier. If you have any trouble using or understanding it, I can explain it tomorrow when I have some free time, in case no one else responds by then.

import numpy as np 
from sklearn.metrics import accuracy_score as accuracy


class Knn_classifier:
    """Brute-force k-nearest-neighbour classifier built on NumPy.

    The training set is stored as given; every prediction computes the
    distance from the query sample to each training row, picks the
    ``num_neighbors`` closest rows and returns the most frequent label
    among them.
    """

    def __init__(self, train_images, train_labels):
        """Store the training data.

        Args:
            train_images: 2-D array-like with one flattened sample per row.
            train_labels: 1-D array-like of non-negative integer labels,
                one per row of ``train_images`` (required by
                ``np.bincount``).
        """
        self.train_images = train_images
        self.train_labels = train_labels

    def classify_image(self, test_image, num_neighbors=3, metric='l2'):
        """Predict the label of a single sample.

        Args:
            test_image: 1-D array-like with as many features as a training
                row.
            num_neighbors: number of nearest training rows that vote.
            metric: ``'l2'`` for Euclidean distance; any other value
                selects the L1 (sum of absolute differences) distance.

        Returns:
            The label with the most votes among the nearest neighbours.
            On a tie the smallest label wins, because ``np.argmax``
            returns the first maximum.

        Raises:
            ValueError: if ``num_neighbors`` is smaller than 1.
        """
        if num_neighbors < 1:
            raise ValueError("num_neighbors must be at least 1")

        # Compute in float64: with unsigned integer inputs (e.g. uint8
        # pixels) the subtraction and the squaring would otherwise wrap
        # around and yield bogus distances. For float64 input this is a
        # no-op view, for other dtypes it makes a converted copy.
        train = np.asarray(self.train_images, dtype=np.float64)
        diff = train - np.asarray(test_image, dtype=np.float64)

        if metric == 'l2':
            distances = np.sqrt(np.sum(np.square(diff), axis=1))
        else:
            distances = np.sum(np.abs(diff), axis=1)

        # Indices of the closest training rows, nearest first.
        nearest = np.argsort(distances)[:num_neighbors]
        votes = np.bincount(np.asarray(self.train_labels)[nearest])
        return np.argmax(votes)

    def classify_images(self, test_images, num_neighbors=3, metric='l2'):
        """Predict a label for every row of ``test_images``.

        Returns:
            A list with one predicted label per test sample, in input
            order.
        """
        return [
            self.classify_image(image, num_neighbors, metric)
            for image in test_images
        ]

    def accuracy_score(self, predicted, ground_truth):
        """Return the accuracy of ``predicted`` against ``ground_truth``.

        Delegates to the module-level ``accuracy`` import
        (``sklearn.metrics.accuracy_score``) and scales the result by 100
        so it reads as a percentage.
        """
        return accuracy(predicted, ground_truth) * 100


# Load the pre-split dataset from the local ``data`` directory:
# training images/labels first, then the held-out test images/labels.
train_images, train_labels = (
    np.load('data/train_images.npy'),
    np.load('data/train_labels.npy'),
)
test_images, test_labels = (
    np.load('data/test_images.npy'),
    np.load('data/test_labels.npy'),
)

# Fit (i.e. memorise) the training set, then label every test image using
# the L1 metric and the classifier's default neighbour count.
knn = Knn_classifier(train_images=train_images, train_labels=train_labels)
predicted = knn.classify_images(test_images, metric='l1')

# NOTE: this bare expression only shows the accuracy in an interactive
# session or notebook; run as a plain script the value is discarded.
knn.accuracy_score(predicted, test_labels)
    

The shapes of my train_images and train_labels were (1000, 784) and (1000,).

CodePudding user response:

Here's a fully vectorized solution.

import numpy as np

# Problem sizes: training samples, test samples, features per sample,
# number of distinct labels, and neighbours consulted per prediction.
(N_train, N_test, N_feats, N_labels, k) = (1000, 300, 400, 20, 5)

# Random stand-in data so the snippet runs on its own; substitute real
# arrays of the same shapes to classify an actual dataset.
train_X = np.random.rand(N_train, N_feats)
train_y = np.random.randint(N_labels, size=N_train)
test_X = np.random.rand(N_test, N_feats)

# Squared Euclidean distance between every test row a and training row b,
# expanded as ||a||^2 - 2 a.b + ||b||^2 so the whole matrix comes from one
# matrix product instead of an (N_test, N_train, N_feats) intermediate.
# See: https://jaykmody.com/blog/distance-matrices-with-numpy/.
test_X2 = np.sum(test_X**2, axis=1, keepdims=True)  # (N_test, 1)
train_X2 = np.sum(train_X**2, axis=1)  # (N_train,)
test_train_X = test_X @ train_X.T  # (N_test, N_train)
# Broadcasting adds the column of test norms and the row of train norms.
sq_dists = test_X2 - 2 * test_train_X + train_X2

# The square root is monotonic, so ranking by squared distance yields the
# same neighbours as ranking by distance.
k_nearest_neighbors = np.argsort(sq_dists, axis=1)[:, :k]
k_labels = train_y[k_nearest_neighbors]  # (N_test, k)

# Majority vote without a Python loop: one-hot encode each neighbour label,
# count the votes per class, and take the most voted class (the smallest
# label wins ties because argmax returns the first maximum).
# See: https://stackoverflow.com/a/71812803/1316276.
k_labels_onehot = k_labels[..., None] == np.arange(N_labels)[None, None, :]
pred_y = np.argmax(np.count_nonzero(k_labels_onehot, axis=1), axis=-1)
  • Related