One class SVM always returns FALSE-CodePudding

I have a project in my ML course about anomaly/novelty detection and decided to study the one-class SVM algorithm as described in this paper: http://research.microsoft.com/pubs/69731/tr-99-87.pdf. The e1071 package in R has an svm function that seems to support one-class classification. However, when I try to use it, the predictor always returns FALSE (even on the training set, which is the weirdest thing). Here is my code:

library(e1071) # for svm classifier
library(IMIFA) # for USPS dataset
library(caret) # for confusion matrices

# Load the USPS digits data set (the library(IMIFA) line above names IMIFA as
# its source).
data(USPSdigits)

# For both the train and the test split: sort the rows by the digit label in
# V1, then add a character column is.zero that is "TRUE" where V1 == 0 and
# "FALSE" everywhere else.
digits.train <- USPSdigits$train
digits.train <- digits.train[order(digits.train$V1), ]
digits.train$is.zero[digits.train$V1 == 0] <- "TRUE"
digits.train$is.zero[digits.train$V1 != 0] <- "FALSE"
digits.test <- USPSdigits$test
digits.test <- digits.test[order(digits.test$V1), ]
digits.test$is.zero[digits.test$V1 == 0] <- "TRUE"
digits.test$is.zero[digits.test$V1 != 0] <- "FALSE"

# One-class training set: only the rows with V1 == 0. Columns 1 and 258 are
# dropped from the features and column 258 is read back as the label.
# Presumably column 1 is V1 and column 258 is is.zero (V1 + 256 pixel columns
# + is.zero) -- verify against the data set.
digits.train.features <- digits.train[digits.train$V1 == 0, -c(1, 258)]
digits.train.labels <- digits.train[digits.train$V1 == 0, 258]
digits.train.nu <- 0.5
# NOTE(review): this value is passed straight through as gamma below. e1071's
# radial kernel is documented as exp(-gamma * |u - v|^2), so if 0.5 * 256 is
# meant as the kernel width c from the linked paper, the matching setting
# would be gamma = 1 / c rather than gamma = c -- confirm against ?svm. A
# gamma this large is a plausible cause of the all-FALSE predictions shown
# below.
digits.train.bandwith <- 0.5*256
digits.train.model <- svm(x = digits.train.features, type = 'one-classification', kernel = 'radial', nu = digits.train.nu, gamma = digits.train.bandwith)
# Score the same rows the model was fitted on and cross-tabulate the
# predictions against the labels.
digits.train.fitted <- predict(digits.train.model, digits.train.features)
digits.train.confusionMatrix <- table(Predicted = digits.train.fitted, Reference = digits.train.labels)
print(digits.train.confusionMatrix)

# Same evaluation on the full test split (all digits), with the two label
# columns removed from the features by name.
digits.test.features <- subset(digits.test, select = -c(is.zero, V1))
digits.test.labels <- digits.test$is.zero
digits.test.fitted <- predict(digits.train.model, digits.test.features)
digits.test.confusionMatrix <- table(Predicted = digits.test.fitted, Reference = digits.test.labels)
print(digits.test.confusionMatrix)

and my output is :

> print(digits.train.confusionMatrix)
         Reference
Predicted TRUE
    FALSE 1194
> print(digits.test.confusionMatrix)
         Reference
Predicted FALSE TRUE
    FALSE  1648  359

What am I doing wrong?

CodePudding user response：

I created a working example. Adjust naming conventions to your own. I have used a very expressive naming convention to show exactly what I did.

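Training on only one digit results in a lot of columns that hold (almost) the same value in every row. These columns cause problems in svm and should be removed. The function nearZeroVar from caret is ideal for this. If you ever use the recipes package, the equivalent step is called step_nzv. Note that the model below also leaves gamma at its default instead of setting it to 0.5*256.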

library(e1071)
# library(caret) # caret used for nearZeroVar function.

# Data steps like OP's. USPSdigits ships with the IMIFA package; loading it
# here lets this script run in a fresh session as well.
data(USPSdigits, package = "IMIFA")
digits.train <- USPSdigits$train
digits.test <- USPSdigits$test

# Character label for every row: "TRUE" for the digit 0 (the class the model
# is trained on), "FALSE" for every other digit.
digits.train$is.zero <- ifelse(digits.train$V1 == 0, "TRUE", "FALSE")
digits.test$is.zero <- ifelse(digits.test$V1 == 0, "TRUE", "FALSE")

# One-class training set: only the zeros.
train_the_positives <- subset(digits.train, is.zero == "TRUE")

# Columns that must not reach the model:
#  * columns that hold (almost) the same value in every training row,
#    found by nearZeroVar with a 99:1 frequency-ratio cut-off;
#  * the two label columns V1 and is.zero. They are constant in
#    train_the_positives, so nearZeroVar flags them anyway, but listing them
#    explicitly means the features never depend on that side effect and
#    cols_to_remove is never empty (x[, -integer(0)] would select no columns).
label_cols <- match(c("V1", "is.zero"), names(train_the_positives))
cols_to_remove <- union(
  caret::nearZeroVar(train_the_positives, freqCut = 99 / 1),
  label_cols
)

# gamma is deliberately not set here, so svm() uses its default.
svm.model <- svm(train_the_positives[, -cols_to_remove],
                 type = "one-classification",
                 nu = 0.10,
                 kernel = "radial")

# predictions on train_the_positives set
svm_predictions_on_train_the_positives <- predict(
  svm.model,
  train_the_positives[, -cols_to_remove]
)
table(Predicted = svm_predictions_on_train_the_positives,
      Reference = train_the_positives$is.zero)

         Reference
Predicted TRUE
    FALSE  121
    TRUE  1073

# Score every training digit (zeros and non-zeros) with the fitted one-class
# model, using the same feature columns it was trained on, and cross-tabulate
# the result against the is.zero labels.
full_train_features <- digits.train[, -cols_to_remove]
svm_prediction_on_full_train_set <- predict(svm.model,
                                            newdata = full_train_features)
table(
  Predicted = svm_prediction_on_full_train_set,
  Reference = digits.train$is.zero
)

         Reference
Predicted FALSE TRUE
    FALSE  6069  121
    TRUE     28 1073


# Score the held-out test digits with the fitted one-class model, using the
# same feature columns it was trained on, and cross-tabulate the result
# against the is.zero labels.
test_features <- digits.test[, -cols_to_remove]
svm_prediction_on_test_set <- predict(svm.model, newdata = test_features)
table(
  Predicted = svm_prediction_on_test_set,
  Reference = digits.test$is.zero
)

         Reference
Predicted FALSE TRUE
    FALSE  1638   68
    TRUE     10  291