I wanted to use the optimal k value to conduct kNN classification, predict the dependent variable diabetes in the test set using the train set, and compare the results with the real values.
I've already found the optimal k value and computed the accuracy. After that, I wanted to compare the predictions with the real values using confusionMatrix, but I ran into a problem with differing lengths.
I've already checked that the nrow and length quantities are the same (both 74), but I still get the same error.
Could you help me overcome this problem?
My code is below:
# --- Setup: load packages and the Pima Indians Diabetes data ----
# NOTE(review): install.packages() on every run is unnecessary; run once
# interactively. gbm is loaded but never used in this kNN workflow.
install.packages("mlbench")
install.packages("gbm")
library(mlbench)
library(gbm)
# Outcome is the factor column `diabetes` (column 9); columns 1-8 are predictors.
data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)
MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)
# --- Missing values: inspect, then drop incomplete rows ----
any(is.na(MLdata))
sum(is.na(MLdata))
MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)
# --- 60/20/20 train / validation / test split ----
set.seed(3333)
MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)
MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]
head(MLTrain)
head(MLValid)
head(MLTest)
str(MLTrain)
str(MLValid)
str(MLTest)
# BUG(review): MLTestY is not defined until several lines below, so this
# View() call errors with "object 'MLTestY' not found". Remove it or move it
# after the MLTestY assignment.
View(MLTestY)
# Split predictors (X) from the outcome (Y, column 9). Wrapping the factor in
# as.data.frame() gives it the awkward column name `MLTrain[, 9]`, which is why
# backtick access like MLTrainY$`MLTrain[, 9]` is needed later.
MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]
MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])
View(MLTrainX)
View(MLTrainY)
# --- Normalisation: centre/scale using training-set statistics only ----
library(caret)
NormValues <- preProcess(MLTrainX, method = c("center", "scale"))
TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)
head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)
# NOTE(review): both FNN and class export knn(); because class is attached
# last, class::knn() masks FNN::knn() in the calls below.
install.packages('FNN')
library(FNN)
library(class)
# --- Baseline kNN with k = 3 ----
set.seed(3333)
NN <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 3)
NN
# BUG(review): NN contains one prediction per VALIDATION row, but it is being
# compared against the TRAINING labels. The two vectors have different lengths
# (recycling warning) and the "accuracy" is meaningless. Compare against
# MLValidY instead.
Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
Accuracy3
nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')
# --- Search over k = 1..n_train for the best validation accuracy ----
set.seed(3333)
AccuracyK <- NULL
# BUG(review): this for loop is never closed — the matching `}` is missing, so
# everything below is parsed as part of the loop body. Close the loop right
# after the AccuracyK accumulation line.
for(kk in c(1:nrow(TrainXNormDF))){
Knn_K <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = kk)
# BUG(review): same label mismatch as above — predictions on the validation
# set are compared with training labels, so the selected "optimal" k is wrong.
AccuracyK <- c(AccuracyK, sum(Knn_K == MLTrainY$'MLTrain[, 9]') / length(MLTrainY$'MLTrain[, 9]'))
ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)
# Smallest k achieving the maximum accuracy (result printed, not stored).
min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])
plot(formula = accuracy ~ k,
data = ValidK,
type = "o",
pch = 5,
main = "Optimal K Validation")
with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))
# --- Refit at k = 120 (the k suggested by the flawed search above) ----
set.seed(3333)
NN120 <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 120)
# BUG(review): training labels used again instead of validation labels.
Accuracy120 <- sum(NN120 == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
Accuracy120
# --- Final model on the test predictors ----
set.seed(3333)
FinalNN <- knn(train = TrainXNormDF,
test = TestXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 120)
# BUG(review): test-set predictions compared with TRAINING labels; should be
# MLTestY.
AccuracyFinal <- sum(FinalNN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
AccuracyFinal
And here is where I got the problem:
Result <- confusionMatrix(FinalNN, TestXNormDF)
CodePudding user response:
I think you are looking for this:
Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )
Confusion Matrix and Statistics
Reference
Prediction neg pos
neg 49 22
pos 0 3
Accuracy : 0.7027
95% CI : (0.5852, 0.8034)
No Information Rate : 0.6622
P-Value [Acc > NIR] : 0.2724
Kappa : 0.153
Mcnemar's Test P-Value : 7.562e-06
Sensitivity : 1.0000
Specificity : 0.1200
Pos Pred Value : 0.6901
Neg Pred Value : 1.0000
Prevalence : 0.6622
Detection Rate : 0.6622
Detection Prevalence : 0.9595
Balanced Accuracy : 0.5600
'Positive' Class : neg
Besides that, I recommend fixing the braces on the for loop and cleaning up some of the code. A good refactoring would help!
CodePudding user response:
AugPelle has answered your question on how to get a confusion matrix.
I just wanted to point out that you're calculating accuracy incorrectly throughout your code. You should calculate accuracy based on the validation set or test set, not on the training set. This is why you're getting warnings, and it's giving you incorrect answers with a non-optimal k.
Below is the corrected code:
# --- Setup: load packages and the Pima Indians Diabetes data ----
# NOTE(review): install.packages() is best run once interactively rather than
# on every execution; gbm is loaded but unused in this kNN workflow.
install.packages("mlbench")
install.packages("gbm")
library(mlbench)
library(gbm)
# Outcome is the factor column `diabetes` (column 9); columns 1-8 are predictors.
data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)
MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)
# --- Missing values: inspect, then drop incomplete rows ----
any(is.na(MLdata))
sum(is.na(MLdata))
MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)
# --- 60/20/20 train / validation / test split ----
set.seed(3333)
MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)
MLTrain <- MLdata2[MLIdx == 1, ]
MLValid <- MLdata2[MLIdx == 2, ]
MLTest <- MLdata2[MLIdx == 3, ]
head(MLTrain)
head(MLValid)
head(MLTest)
str(MLTrain)
str(MLValid)
str(MLTest)
# FIX(review): removed the premature View(MLTestY) that appeared here —
# MLTestY is not defined until below, so that call errored with
# "object 'MLTestY' not found".
# Split predictors (X) from the outcome (Y, column 9). Wrapping the factor in
# as.data.frame() produces the awkward column name `MLTrain[, 9]`, which is
# why backtick access like MLTrainY$`MLTrain[, 9]` is needed later.
MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]
MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])
View(MLTestY)
View(MLTrainX)
View(MLTrainY)
# --- Normalisation: centre/scale using TRAINING-set statistics only, then
# apply the same transformation to the validation and test predictors ----
library(caret)
NormValues <- preProcess(MLTrainX, method = c("center", "scale"))
TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)
head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)
# NOTE(review): both FNN and class export knn(); because class is attached
# last, class::knn() is the one actually called below.
install.packages('FNN')
library(FNN)
library(class)
# --- Baseline kNN with k = 3, scored on the VALIDATION labels ----
set.seed(3333)
NN <- knn(train = TrainXNormDF,
          test = ValidXNormDF,
          cl = MLTrainY$`MLTrain[, 9]`,
          k = 3)
NN
Accuracy3 <- sum(NN == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)
Accuracy3
nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')
# --- Search k = 1..n_train, recording validation accuracy for each k ----
set.seed(3333)
AccuracyK <- NULL
for (kk in seq_len(nrow(TrainXNormDF))) {
  Knn_K <- knn(train = TrainXNormDF,
               test = ValidXNormDF,
               cl = MLTrainY$`MLTrain[, 9]`,
               k = kk)
  AccuracyK <- c(AccuracyK, sum(Knn_K == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`))
}
ValidK <- data.frame(k = seq_len(nrow(TrainXNormDF)), accuracy = AccuracyK)
# FIX(review): store the selected k (smallest k attaining the maximum
# validation accuracy — here 36) instead of discarding it and hard-coding the
# number below; the script then stays correct if the split or seed changes.
BestK <- min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])
BestK
plot(formula = accuracy ~ k,
     data = ValidK,
     type = "o",
     pch = 5,
     main = "Optimal K Validation")
with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))
# --- Refit at the selected k and confirm validation accuracy ----
# (Renamed from NN120: the model uses BestK, not 120.)
set.seed(3333)
NNBest <- knn(train = TrainXNormDF,
              test = ValidXNormDF,
              cl = MLTrainY$`MLTrain[, 9]`,
              k = BestK)
AccuracyBest <- sum(NNBest == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)
AccuracyBest
# --- Final model: predict the TEST set and score against the TEST labels ----
set.seed(3333)
FinalNN <- knn(train = TrainXNormDF,
               test = TestXNormDF,
               cl = MLTrainY$`MLTrain[, 9]`,
               k = BestK)
AccuracyFinal <- sum(FinalNN == MLTestY$`MLTest[, 9]`) / length(MLTestY$`MLTest[, 9]`)
AccuracyFinal
# confusionMatrix() needs two factors of equal length: the predictions and the
# true test labels (not the test predictor data frame).
Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]`)
Result