Error in eval(predvars, data, env) : object 'oly.success' not found in Regression model-CodePudding

I have looked into this problem, and some people suggest that changing the column name might work. But I can't seem to figure out which column is causing the issue.

my code

library(Amelia)
library(corrplot)
library(GGally)
library(caret)

# Load the raw data and take a first look at it.
data <- asianmen_100.free
summary(data)

# Remove unwanted variables
reject_vars <- names(data) %in% c(
  "firstname", "lastname", "country", "Event", "Pool.Length", "Competition",
  "Comp.Country", "name", "DOB", "Date", "mins", "secs", "minsAsSecDuration",
  "earliest_date", "Final_Medal", "Time", "secsAsDuration"
)

data.new <- data[!reject_vars]
data.new$Age. <- as.numeric(data.new$Age.)


# Remove target variable
# NOTE(review): this drops the oly_success column from data.new, yet the model
# formula further down uses oly_success as the response. data.train is cut
# from data.new, so glm() cannot find the response column.
remove_vars <- names(data.new) %in% c("oly_success")
data.new <- data.new[!remove_vars]


ggcorr(data.new, label = TRUE)


# Find variables that have higher cross-correlation
M <- data.matrix(data.new)
corrM <- cor(M)
highlyCorrM <- findCorrelation(corrM, cutoff = 0.5)
names(data.new)[highlyCorrM]


# Sample size: two thirds of the rows (rounded down) go to the training set
smp_size <- floor(2 / 3 * nrow(data.new))
set.seed(2)


# Shuffle the rows, then split into training and test sets.
# The test set starts at row smp_size + 1 so the two sets do not overlap.
data.new <- data.new[sample(nrow(data.new)), ]
data.train <- data.new[seq_len(smp_size), ]
data.test <- data.new[(smp_size + 1):nrow(data.new), ]


# Model building: logistic regression of oly_success on all other columns

formula <- oly_success ~ .

rmodel <- glm(formula = formula,
              data = data.train,
              family = binomial(link = "logit"))

summary(rmodel)

This is the data :

> head(data.new)
# A tibble: 6 x 8
   Age. timeAsDuration Success oly_success first_appear.age first_oly.age age_diff total_medal
  <dbl> <Duration>       <dbl>       <dbl>            <dbl>         <dbl>    <dbl>       <dbl>
1    20 49.37s               0           0               17            NA       NA           1
2    21 49.8s                0           0               21            NA       NA           0
3    16 57.75s               0           0               16            NA       NA           0
4    20 51.42s               0           0               17            NA       NA           0
5    21 51.01s               0           0               16            NA       NA           2
6    NA 54.11s               0           0               NA            NA       NA           0

Sample data

> dput(data.new[1:10,])
structure(list(Age. = c(20, 21, 16, 20, 21, NA, 19, 25, 26, 24
), timeAsDuration = new("Duration", .Data = c(49.37, 49.8, 57.75, 
51.42, 51.01, 54.11, 50.88, 57.69, 51.49, 49.97)), Success = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), oly_success = c(0, 0, 0, 0, 0, 0, 
0, 1, 0, 0), first_appear.age = c(17, 21, 16, 17, 16, NA, 19, 
25, 25, 23), first_oly.age = c(NA, NA, NA, NA, NA, NA, NA, 26, 
NA, NA), age_diff = c(NA, NA, NA, NA, NA, NA, NA, 1, NA, NA), 
    total_medal = c(1, 0, 0, 0, 2, 0, 0, 0, 0, 1)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

I have tried changing some of the column names, and even the target variable's name to oly.success, and still no success. Where am I going wrong?

CodePudding user response：

First of all, in your dput(data.new) output the target variable is called oly_success, but in the formula you use oly.success. Second, you remove the target variable with:

#Remove Target variables
# names(data.new) %in% c("oly_success") is TRUE only for the oly_success
# column; indexing with its negation keeps every other column, so data.new
# no longer contains oly_success after these two lines.
remove_vars <- names(data.new) %in% c("oly_success") 
data.new <- data.new[!remove_vars]

If you fix these errors, your code works well:

library(Amelia)
library(corrplot)
library(GGally)
library(caret)
   
# Correlation plot of the columns in data.new
ggcorr(data.new, label = TRUE)


# Find variables that have higher cross-correlation
M <- data.matrix(data.new)
corrM <- cor(M)
highlyCorrM <- findCorrelation(corrM, cutoff = 0.5)
names(data.new)[highlyCorrM]


# Sample size: two thirds of the rows (rounded down) go to the training set
smp_size <- floor(2 / 3 * nrow(data.new))
set.seed(2)


# Shuffle the rows, then split into training and test sets.
# The test set starts at row smp_size + 1 so the two sets do not overlap.
data.new <- data.new[sample(nrow(data.new)), ]
data.train <- data.new[seq_len(smp_size), ]
data.test <- data.new[(smp_size + 1):nrow(data.new), ]


# Model building
# I use the entire dataset because the training one does not have all the
# levels for the logistic regression, since the example dataset is too small.
rmodel <- glm(formula = oly_success ~ .,
              data = data.new,
              family = binomial(link = "logit"))

summary(rmodel)