Home > OS >  K-Nearest Neighbors program always reports same class value
K-Nearest Neighbors program always reports same class value

Time:11-02

I've written a short implementation of the KNN algorithm to determine a sample point {5.2,3.1}'s class according to a brief snippet of the iris dataset, however the class is always reported as 1 (Virginica). It is not immediately obvious to me where the issue arises in my code. Can someone please help me figure out where/why this occurs?

#include <iostream>
#include <math.h>
#include <string>

//Setosa = 0, Virginica = 1, Verscicolor = 2
//[0] and [1] = data point, [2] = class, [3] = distance
double train_data[15][3] = {
{5.3,3.7,0},{5.1,3.8,0},{7.2,3.0,1},
{5.4,3.4,0},{5.1,3.3,0},{5.4,3.9,0},
{7.4,2.8,1},{6.1,2.8,2},{7.3,2.9,1},
{6.0,2.7,2},{5.8,2.8,1},{6.3,2.3,2},
{5.1,2.5,2},{6.3,2.5,2},{5.5,2.4,2}
};

double Distance(double attr1, double attr2, double sAttr1, double sAttr2)
        {
            return sqrt(pow(attr1-sAttr1, 2.0) pow(attr2-sAttr2, 2.0));
        }

int findMaxIndex(float *classes)
    {
        int maxIndex = 0;
        for (int i = 0; i < 3; i  ){
            if (classes[i] > classes[maxIndex])
            maxIndex = i;
        }
        return 0;
    }

 int main(){

    for(int i = 0; i < 15; i  ){
                train_data[i][3] = Distance(train_data[i][0],train_data[i][1],5.2,3.1);
    }

    for(int i = 0; i < 15; i  ){
                for (int j = i 1; j < 15; j  ){

                    if (train_data[i][3] < train_data[j][3]){
                        //swap
                        for(int k = 0; k < 4; k  ){
                            double temp = train_data[i][k];
                            train_data[i][k] = train_data[j][k];
                            train_data[j][k] = temp;
                        }
                    }
                }
    }   

    //Based on a value for k determine the class
            int K = 5;
            float *classes = new float[3];
            for (int i =0; i < 3; i  ){
                classes[i] = 0;
            }
            for (int i = 0 ; i < K; i  )
            {
                classes[(int)train_data[i][2]-1]  ;
            }
            
        int predictedLabel = findMaxIndex(classes) 1;
        std::cout << "Predicted class for point {5.2,3.1} is: " << predictedLabel << std::endl;
        return 0;
}

CodePudding user response:

If you enable warnings, you'll see that

test.cpp|33 col 32| warning: array subscript 3 is above array bounds of ‘double [3]’ [-Warray-bounds]
||    33 |                 train_data[i][3] = Distance(train_data[i][0],train_data[i][1],5.2,3.1);

Array indexes start at 0.

Later on, you also subtract (class - 1) to index the classes array. Oops. That was already zero-based, so it will become negative.

Consider avoiding the whole source of errors:

struct {
    double x, y;
    int _class = 0;
    double distance = 0;
} train_data[] = {
    {5.3, 3.7, 0}, {5.1, 3.8, 0}, {7.2, 3.0, 1}, //
    {5.4, 3.4, 0}, {5.1, 3.3, 0}, {5.4, 3.9, 0}, //
    {7.4, 2.8, 1}, {6.1, 2.8, 2}, {7.3, 2.9, 1}, //
    {6.0, 2.7, 2}, {5.8, 2.8, 1}, {6.3, 2.3, 2}, //
    {5.1, 2.5, 2}, {6.3, 2.5, 2}, {5.5, 2.4, 2}  //
};

for(auto& node : train_data) {
    node.distance = Distance(node.x, node.y, 5.2, 3.1);
}

Also note that std::swap(node1, node2) will just work. Or make the node type sortable.

More Idiomatic C

Here's my take

Live On Coliru

#include <array>
#include <iostream>
#include <cmath>
#include <string>
#include <vector>

//Setosa = 0, Virginica = 1, Verscicolor = 2
enum Class { Setosa, Virginica, Verscicolor, NCLASSES };

struct node {
    double x, y;
    Class  _class   = Setosa;
    double distance = 0;

    bool operator<(node const& other) const {
        return distance > other.distance;
    }
};

std::vector<node> train_data{
    {5.3, 3.7, Setosa},      {5.1, 3.8, Setosa},      {7.2, 3.0, Virginica},   //
    {5.4, 3.4, Setosa},      {5.1, 3.3, Setosa},      {5.4, 3.9, Setosa},      //
    {7.4, 2.8, Virginica},   {6.1, 2.8, Verscicolor}, {7.3, 2.9, Virginica},   //
    {6.0, 2.7, Verscicolor}, {5.8, 2.8, Virginica},   {6.3, 2.3, Verscicolor}, //
    {5.1, 2.5, Verscicolor}, {6.3, 2.5, Verscicolor}, {5.5, 2.4, Verscicolor}  //
};

double Distance(double x1, double y1, double x2, double y2) {
    return sqrt(pow(x1 - x2, 2)   pow(y1 - y2, 2));
}

// Based on a value for k determine the class
Class predict(double x, double y, unsigned K) {
    for (auto& node : train_data) {
        node.distance = Distance(node.x, node.y, x, y);
    }
    sort(train_data.begin(), train_data.end());

    // tally buckets:
    std::array<unsigned, NCLASSES> classes = {0, 0, 0};

    for (unsigned i = 0; i < K; i  ) {
        classes[train_data[i]._class]  ;
    }

    auto index = std::max_element(classes.begin(), classes.end()) //
        - classes.begin();
    return static_cast<Class>(index);
}

int main()
{
    std::cout << "Predicted class for point {5.2,3.1} is: "
              << predict(5.2, 3.1, 5) << "\n";
}

Prints

Predicted class for point {5.2,3.1} is: 1

I do suspect you had the sorting order inverted? With the sort order flipped:

Live On Coliru

Predicted class for point {5.2,3.1} is: 0
  • Related