How to use your trained xgb-model in r to apply it on a new dataset?-CodePudding

I trained a xgb model like this:

# Dummy-encode the predictors. `0 + .` drops the intercept and uses every
# column of the data frame except the response `job_change`.
candidates_var_train <- model.matrix(job_change ~ 0 + ., data = candidates_train)
# xgboost needs a numeric label: 1 for "Interested", 0 otherwise.
candidates_train_xgb <- xgb.DMatrix(
  data = candidates_var_train,
  label = as.numeric(candidates_train$job_change == "Interested")
)

# Encode the test set with the same formula as the training set.
candidates_var_test <- model.matrix(job_change ~ 0 + ., data = candidates_test)
candidates_test_xgb <- xgb.DMatrix(
  data = candidates_var_test,
  label = as.numeric(candidates_test$job_change == "Interested")
)

I got a decent AUC and now want to apply the model to a new data set. The new data is stored as a data frame and has the same columns as the test/training data, except for the target variable "job_change". I tried to convert it into a sparse matrix like this:

# Attempt from the question: the data frame is converted as-is, so its factor
# columns are NOT expanded into dummy columns the way model.matrix() expands
# them for the training data.
candidates_predict_sparse <- as(as.matrix(candidates_predict), "sparseMatrix")
candidates_predict_xgb <- xgb.DMatrix(data = candidates_predict_sparse)

But NAs were introduced in the sparse matrix, and when I try to make a prediction using predict(), the following error occurs:

Error in predict.xgb.Booster(xgb_model, newdata = candidates_predict_sparse,  : 
  Feature names stored in `object` and `newdata` are different!

EDIT: Reproducible Example

minimal datasets:

candidates_predict (dataset I want to have the prediction for)

structure(list(enrollee_id = c(23427, 17605, 20912, 13948, 15205, 
15140, 21736, 19800, 23755, 12148), city_development_index = c(0.698, 
0.896, 0.754, 0.926, 0.92, 0.878, 0.926, 0.767, 0.689, 0.92), 
    gender = structure(c(4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L), levels = c("Female", "Male", "Other", "keine Angabe"
    ), class = "factor"), enrolled_university = structure(c(4L, 
    2L, 1L, 2L, 1L, 3L, 3L, 2L, 2L, 2L), levels = c("Full time course", 
    "no_enrollment", "Part time course", "keine Angabe"), class = "factor"), 
    company_size = structure(c(9L, 9L, 9L, 5L, 3L, 9L, 3L, 6L, 
    2L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999", 
    "10000 ", "50-99", "500-999", "5000-9999", "keine Angabe"
    ), class = "factor"), company_type = structure(c(7L, 7L, 
    7L, 6L, 6L, 7L, 6L, 6L, 6L, 7L), levels = c("Early Stage Startup", 
    "Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd", 
    "keine Angabe"), class = "factor"), last_new_job = structure(c(6L, 
    6L, 6L, 1L, 1L, 1L, 1L, 1L, 5L, 5L), levels = c("1", "2", 
    "3", "4", ">4", "never", "keine Angabe"), class = "factor"), 
    training_hours = c(63, 10, 46, 18, 55, 4, 324, 26, 140, 158
    ), education_detail = structure(c(8L, 7L, 7L, 21L, 8L, 22L, 
    7L, 7L, 7L, 19L), levels = c("Graduate Arts", "Graduate Business Degree", 
    "Graduate Humanities", "Graduate No Major", "Graduate no major discipline", 
    "Graduate Other", "Graduate STEM", "High School", "keine Angabe", 
    "Masters Arts", "Masters Business Degree", "Masters Humanities", 
    "Masters No Major", "Masters no major discipline", "Masters Other", 
    "Masters STEM", "Phd Arts", "Phd Business Degree", "Phd Humanities", 
    "Phd Other", "Phd STEM", "Primary School"), class = "factor"), 
    experience_detail = structure(c(23L, 23L, 23L, 23L, 23L, 
    21L, 23L, 17L, 10L, 23L), levels = c("<1", ">20", "1", "10", 
    "11", "12", "13", "14", "15", "16", "17", "18", "19", "2", 
    "20", "3", "4", "5", "6", "7", "8", "9", "no relevant experience"
    ), class = "factor")), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame"))

candidates_train (dataset I trained the xgboost model with)

structure(list(enrollee_id = c(26270, 3166, 20087, 8518, 8899, 
25403, 14514, 3300, 10364, 5220), city_development_index = c(0.92, 
0.887, 0.698, 0.92, 0.92, 0.92, 0.624, 0.84, 0.926, 0.754), gender = structure(c(1L, 
2L, 2L, 2L, 4L, 2L, 2L, 4L, 4L, 2L), levels = c("Female", "Male", 
"Other", "keine Angabe"), class = "factor"), enrolled_university = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L), levels = c("Full time course", 
"no_enrollment", "Part time course", "keine Angabe"), class = "factor"), 
    company_size = structure(c(7L, 9L, 1L, 9L, 9L, 3L, 9L, 2L, 
    5L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999", 
    "10000 ", "50-99", "500-999", "5000-9999", "keine Angabe"
    ), class = "factor"), company_type = structure(c(2L, 7L, 
    2L, 7L, 7L, 6L, 7L, 6L, 4L, 7L), levels = c("Early Stage Startup", 
    "Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd", 
    "keine Angabe"), class = "factor"), last_new_job = structure(c(3L, 
    1L, 1L, 1L, 6L, 1L, 6L, 3L, 5L, 4L), levels = c("1", "2", 
    "3", "4", ">4", "never", "keine Angabe"), class = "factor"), 
    training_hours = c(127, 36, 7, 39, 53, 168, 111, 52, 107, 
    46), job_change = c("Interested", "Not interested", "Not interested", 
    "Not interested", "Not interested", "Not interested", "Not interested", 
    "Not interested", "Not interested", "Not interested"), education_detail = structure(c(3L, 
    7L, 16L, 22L, 22L, 3L, 8L, 7L, 8L, 6L), levels = c("Graduate Arts", 
    "Graduate Business Degree", "Graduate Humanities", "Graduate No Major", 
    "Graduate no major discipline", "Graduate Other", "Graduate STEM", 
    "High School", "keine Angabe", "Masters Arts", "Masters Business Degree", 
    "Masters Humanities", "Masters No Major", "Masters no major discipline", 
    "Masters Other", "Masters STEM", "Phd Arts", "Phd Business Degree", 
    "Phd Humanities", "Phd Other", "Phd STEM", "Primary School"
    ), class = "factor"), experience_detail = structure(c(17L, 
    5L, 18L, 23L, 23L, 14L, 23L, 8L, 5L, 2L), levels = c("<1", 
    ">20", "1", "10", "11", "12", "13", "14", "15", "16", "17", 
    "18", "19", "2", "20", "3", "4", "5", "6", "7", "8", "9", 
    "no relevant experience"), class = "factor")), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`505` = 505L, 
`688` = 688L, `1355` = 1355L, `1498` = 1498L, `1594` = 1594L, 
`3607` = 3607L, `4897` = 4897L, `5743` = 5743L, `5863` = 5863L, 
`5908` = 5908L, `6377` = 6377L, `7449` = 7449L, `7578` = 7578L
), class = "omit"))

candidates_test (dataset I tested the xgboost model with)

structure(list(enrollee_id = c(402, 27107, 8722, 6588, 4167, 
19061, 17139, 14928, 10164, 8612), city_development_index = c(0.762, 
0.92, 0.624, 0.926, 0.92, 0.926, 0.624, 0.92, 0.926, 0.92), gender = structure(c(2L, 
2L, 4L, 2L, 4L, 2L, 4L, 2L, 2L, 4L), levels = c("Female", "Male", 
"Other", "keine Angabe"), class = "factor"), enrolled_university = structure(c(2L, 
2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 2L), levels = c("Full time course", 
"no_enrollment", "Part time course", "keine Angabe"), class = "factor"), 
    company_size = structure(c(1L, 6L, 9L, 2L, 6L, 3L, 7L, 3L, 
    3L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999", 
    "10000 ", "50-99", "500-999", "5000-9999", "keine Angabe"
    ), class = "factor"), company_type = structure(c(6L, 6L, 
    7L, 6L, 6L, 6L, 6L, 6L, 6L, 7L), levels = c("Early Stage Startup", 
    "Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd", 
    "keine Angabe"), class = "factor"), last_new_job = structure(c(5L, 
    1L, 6L, 5L, 6L, 2L, 1L, 3L, 4L, 4L), levels = c("1", "2", 
    "3", "4", ">4", "never", "keine Angabe"), class = "factor"), 
    training_hours = c(18, 46, 26, 18, 106, 50, 148, 40, 42, 
    50), job_change = c("Interested", "Interested", "Not interested", 
    "Not interested", "Not interested", "Not interested", "Interested", 
    "Not interested", "Interested", "Not interested"), education_detail = structure(c(7L, 
    7L, 8L, 7L, 7L, 16L, 7L, 7L, 21L, 7L), levels = c("Graduate Arts", 
    "Graduate Business Degree", "Graduate Humanities", "Graduate No Major", 
    "Graduate no major discipline", "Graduate Other", "Graduate STEM", 
    "High School", "keine Angabe", "Masters Arts", "Masters Business Degree", 
    "Masters Humanities", "Masters No Major", "Masters no major discipline", 
    "Masters Other", "Masters STEM", "Phd Arts", "Phd Business Degree", 
    "Phd Humanities", "Phd Other", "Phd STEM", "Primary School"
    ), class = "factor"), experience_detail = structure(c(7L, 
    20L, 23L, 10L, 3L, 5L, 8L, 2L, 2L, 23L), levels = c("<1", 
    ">20", "1", "10", "11", "12", "13", "14", "15", "16", "17", 
    "18", "19", "2", "20", "3", "4", "5", "6", "7", "8", "9", 
    "no relevant experience"), class = "factor")), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`531` = 531L, 
`615` = 615L, `715` = 715L, `1000` = 1000L, `1148` = 1148L, `1318` = 1318L, 
`1416` = 1416L), class = "omit"))

libraries used

library(Matrix)
library(xgboost)
library(dplyr)
library(readr)

CodePudding user response：

The xgboost model was trained on a dataset with 73 features. That is because model.matrix expands each factor into a set of dummy variables (columns for the levels of each factor, whether or not a level actually occurs in the data), whereas "candidates_predict_sparse" only has 10 columns because its factors have not been dummy-encoded.

> colnames(candidates_train)
 [1] "enrollee_id"            "city_development_index" "gender"                 "enrolled_university"    "company_size"          
 [6] "company_type"           "last_new_job"           "training_hours"         "job_change"             "education_detail"      
[11] "experience_detail"     

> colnames(candidates_var_train)
 [1] "enrollee_id"                                  "city_development_index"                      
 [3] "genderFemale"                                 "genderMale"                                  
 [5] "genderOther"                                  "genderkeine Angabe"                                                 
..... 
[69] "experience_detail6"                           "experience_detail7"                          
[71] "experience_detail8"                           "experience_detail9"                          
[73] "experience_detailno relevant experience"

> colnames(candidates_predict_sparse)
 [1] "enrollee_id"            "city_development_index" "gender"                 "enrolled_university"    "company_size"          
 [6] "company_type"           "last_new_job"           "training_hours"         "education_detail"       "experience_detail"

So the xgboost model expects 73 features at prediction time but only gets 10. To make this work, the new data must have the same features as the data used for training, so you need to dummy-encode "candidates_predict" in the same way. Luckily, that's quite easy:

# The formula below needs a response column to exist; its value is arbitrary
# and it does not become a feature column.
candidates_predict$job_change <- 0
# Use the same `0 + .` formula as for the training data so the factors are
# expanded into dummy columns in the same way.
candidates_predict_dummied <- model.matrix(job_change ~ 0 + ., data = candidates_predict)

# Now you have the same structure and you can use it to predict:
> predict(xgb_model, candidates_predict_dummied)
 [1]  0.3696896434  0.1225184500  0.0037288326 -0.0001312745 -0.1928645670 -0.0001312745 -0.2914776802  0.1280405670  0.3696896434
[10] -0.0001312745