confusionMatrix for knn classification in R


I wanted to use the optimal k value to run kNN classification, predicting the dependent variable diabetes in the test set from the training set, and then compare the results with the real values.

I have already found the optimal k value and computed the accuracy. After that, I wanted to compare the predictions with the real values using confusionMatrix, but I ran into a problem with differing lengths.

I have already checked that the nrow and length values are the same (both 74), but I still get the same problem.

Could you help me overcome this problem?

My code is below:

install.packages("mlbench")
install.packages("gbm")

library(mlbench)
library(gbm)

data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)

MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)

any(is.na(MLdata))
sum(is.na(MLdata))

MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)

set.seed(3333)

MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)

MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]

head(MLTrain)
head(MLValid)
head(MLTest)

str(MLTrain)
str(MLValid)
str(MLTest)

View(MLTestY)


MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]

MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])

View(MLTrainX)
View(MLTrainY)

library(caret)

NormValues <- preProcess(MLTrainX, method = c("center", "scale"))

TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)

head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)


install.packages('FNN')
library(FNN)
library(class)

set.seed(3333)

NN <- knn(train = TrainXNormDF, 
      test = ValidXNormDF,
      cl = MLTrainY$`MLTrain[, 9]`,
      k = 3)

NN

Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

Accuracy3

nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')

set.seed(3333)

AccuracyK <- NULL

for(kk in c(1:nrow(TrainXNormDF))){
Knn_K <- knn(train = TrainXNormDF,
             test = ValidXNormDF,
             cl = MLTrainY$`MLTrain[, 9]`,
             k = kk)
AccuracyK <- c(AccuracyK, sum(Knn_K == MLTrainY$'MLTrain[, 9]') / length(MLTrainY$'MLTrain[, 9]'))


ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)

min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])

plot(formula = accuracy ~ k,
 data = ValidK,
 type = "o",
 pch = 5,
 main = "Optimal K Validation")

with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))

set.seed(3333)

NN120 <- knn(train = TrainXNormDF, 
      test = ValidXNormDF,
      cl = MLTrainY$`MLTrain[, 9]`,
      k = 120)

Accuracy120 <- sum(NN120 == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

Accuracy120

set.seed(3333)

FinalNN <- knn(train = TrainXNormDF, 
           test = TestXNormDF,
           cl = MLTrainY$`MLTrain[, 9]`,
           k = 120)

AccuracyFinal <- sum(FinalNN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

AccuracyFinal

And here is where I run into the problem:

Result <- confusionMatrix(FinalNN, TestXNormDF)

CodePudding user response:

I think you are looking for this: confusionMatrix() expects the predictions and the reference labels (two factors of the same length), so pass the test-set labels rather than the data frame of normalized predictors:

Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )
Confusion Matrix and Statistics

          Reference
Prediction neg pos
       neg  49  22
       pos   0   3
                                          
               Accuracy : 0.7027          
                 95% CI : (0.5852, 0.8034)
    No Information Rate : 0.6622          
    P-Value [Acc > NIR] : 0.2724          
                                          
                  Kappa : 0.153           
                                          
 Mcnemar's Test P-Value : 7.562e-06       
                                          
            Sensitivity : 1.0000          
            Specificity : 0.1200          
         Pos Pred Value : 0.6901          
         Neg Pred Value : 1.0000          
             Prevalence : 0.6622          
         Detection Rate : 0.6622          
   Detection Prevalence : 0.9595          
      Balanced Accuracy : 0.5600          
                                          
       'Positive' Class : neg    

Besides that, I recommend fixing the braces on the for loop (the closing } is missing) and cleaning up some of the code. A good refactoring would help!
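For instance, the whole k search could be written more compactly, something like the sketch below. It reuses the objects from the question, assumes accuracy is measured against the validation labels, and replaces the manually grown vector with sapply:

library(class)

set.seed(3333)

# Candidate values of k (same range as in the question)
k_grid <- 1:nrow(TrainXNormDF)

# For each k, predict the validation set and compute the share of correct predictions
AccuracyK <- sapply(k_grid, function(kk) {
  pred <- knn(train = TrainXNormDF,
              test  = ValidXNormDF,
              cl    = MLTrainY$`MLTrain[, 9]`,
              k     = kk)
  mean(pred == MLValidY$`MLValid[, 9]`)
})

ValidK <- data.frame(k = k_grid, accuracy = AccuracyK)

# Smallest k that reaches the maximum validation accuracy
min(ValidK$k[ValidK$accuracy == max(ValidK$accuracy)])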

CodePudding user response:

AugPelle has answered your question on how to get a confusion matrix.

I just wanted to point out that you're calculating accuracy incorrectly throughout your code. Accuracy should be computed against the validation-set or test-set labels, not against the training-set labels. This is why you're getting warnings (the compared vectors have different lengths), and it leads you to a non-optimal k.
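The key change is in lines like this one (taken from the corrected code below), where the validation-set predictions are compared with the validation labels instead of the training labels:

# Wrong: NN predicts the validation set, but is compared with the training labels
Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

# Right: compare with the validation-set labels
Accuracy3 <- sum(NN == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)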

Below is the corrected code:

install.packages("mlbench")
install.packages("gbm")

library(mlbench)
library(gbm)

data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)

MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)

any(is.na(MLdata))
sum(is.na(MLdata))

MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)

set.seed(3333)

MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)

MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]

head(MLTrain)
head(MLValid)
head(MLTest)

str(MLTrain)
str(MLValid)
str(MLTest)



MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]

MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])

View(MLTrainX)
View(MLTrainY)

library(caret)

NormValues <- preProcess(MLTrainX, method = c("center", "scale"))

TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)

head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)


install.packages('FNN')
library(FNN)
library(class)

set.seed(3333)

NN <- knn(train = TrainXNormDF, 
          test = ValidXNormDF,
          cl = MLTrainY$`MLTrain[, 9]`,
          k = 3)

NN

Accuracy3 <- sum(NN == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)

Accuracy3

nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')

set.seed(3333)

AccuracyK <- NULL

for(kk in c(1:nrow(TrainXNormDF))){
  Knn_K <- knn(train = TrainXNormDF,
               test = ValidXNormDF,
               cl = MLTrainY$`MLTrain[, 9]`,
               k = kk)
  AccuracyK <- c(AccuracyK, sum(Knn_K == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`))
}

ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)

min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])

plot(formula = accuracy ~ k,
     data = ValidK,
     type = "o",
     pch = 5,
     main = "Optimal K Validation")

with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))

set.seed(3333)

NN36 <- knn(train = TrainXNormDF,
            test = ValidXNormDF,
            cl = MLTrainY$`MLTrain[, 9]`,
            k = 36)

Accuracy36 <- sum(NN36 == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)

Accuracy36

set.seed(3333)

FinalNN <- knn(train = TrainXNormDF, 
               test = TestXNormDF,
               cl = MLTrainY$`MLTrain[, 9]`,
               k = 36)

AccuracyFinal <- sum(FinalNN == MLTestY$`MLTest[, 9]`) / length(MLTestY$`MLTest[, 9]`)

AccuracyFinal

Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )
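
The object returned by confusionMatrix() also lets you pull out individual pieces if you do not need the full printout, for example:

Result                          # full confusion matrix and statistics
Result$table                    # just the 2x2 table
Result$overall["Accuracy"]      # overall accuracy
Result$byClass["Sensitivity"]   # per-class statistics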