I have looked into this problem, and some people suggest that changing the column names might work. But I can't seem to figure out which column is causing the issue.
My code:
library(Amelia)
library(corrplot)
library(GGally)
library(caret)
# Work on a copy of the raw data set (asianmen_100.free is defined outside
# this snippet).
data <- asianmen_100.free
summary(data)

# Remove unwanted variables
reject_vars <- names(data) %in% c(
  "firstname", "lastname", "country", "Event", "Pool.Length", "Competition",
  "Comp.Country", "name", "DOB", "Date", "mins", "secs", "minsAsSecDuration",
  "earliest_date", "Final_Medal", "Time", "secsAsDuration"
)
data.new <- data[!reject_vars]
data.new$Age. <- as.numeric(data.new$Age.)

# Remove target variable
# NOTE(review): this drops oly_success from data.new, yet the model formula
# further down still uses oly_success as the response.
remove_vars <- names(data.new) %in% c("oly_success")
data.new <- data.new[!remove_vars]
ggcorr(data.new, label = TRUE)

# Find variables that have higher cross-correlation
M <- data.matrix(data.new)
corrM <- cor(M)
highlyCorrM <- findCorrelation(corrM, cutoff = 0.5)
names(data.new)[highlyCorrM]

# Sample size: two thirds of the rows go to the training set
smp_size <- floor(2 / 3 * nrow(data.new))
set.seed(2)

# Shuffle the rows, then split into training and test sets
data.new <- data.new[sample(nrow(data.new)), ]
data.train <- data.new[seq_len(smp_size), ]
# The test set starts at the row right after the training rows.
data.test <- data.new[(smp_size + 1):nrow(data.new), ]

# Model building: logistic regression of oly_success on all other columns
formula <- oly_success ~ .
rmodel <- glm(
  formula = formula,
  data = data.train,
  family = binomial(link = "logit")
)
summary(rmodel)
This is the data:
> head(data.new)
# A tibble: 6 x 8
Age. timeAsDuration Success oly_success first_appear.age first_oly.age age_diff total_medal
<dbl> <Duration> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 20 49.37s 0 0 17 NA NA 1
2 21 49.8s 0 0 21 NA NA 0
3 16 57.75s 0 0 16 NA NA 0
4 20 51.42s 0 0 17 NA NA 0
5 21 51.01s 0 0 16 NA NA 2
6 NA 54.11s 0 0 NA NA NA 0
Sample data
> dput(data.new[1:10,])
structure(list(Age. = c(20, 21, 16, 20, 21, NA, 19, 25, 26, 24
), timeAsDuration = new("Duration", .Data = c(49.37, 49.8, 57.75,
51.42, 51.01, 54.11, 50.88, 57.69, 51.49, 49.97)), Success = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), oly_success = c(0, 0, 0, 0, 0, 0,
0, 1, 0, 0), first_appear.age = c(17, 21, 16, 17, 16, NA, 19,
25, 25, 23), first_oly.age = c(NA, NA, NA, NA, NA, NA, NA, 26,
NA, NA), age_diff = c(NA, NA, NA, NA, NA, NA, NA, 1, NA, NA),
total_medal = c(1, 0, 0, 0, 2, 0, 0, 0, 0, 1)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
I have tried changing some of the column names, and even the target variable's name (e.g. to oly.success), but still no success. Where am I going wrong?
CodePudding user response:
First of all, in your dput(data.new) output
the target variable is called oly_success,
while in the formula you use oly.success.
Second, you remove the target variable with:
# Remove target variable: drops the oly_success column from data.new,
# so a later formula with oly_success as the response cannot find it there.
remove_vars <- names(data.new) %in% c("oly_success")
data.new <- data.new[!remove_vars]
If you fix these errors, your code works well:
library(Amelia)
library(corrplot)
library(GGally)
library(caret)
# data.new must still contain the oly_success column here, because the model
# below uses it as the response.
ggcorr(data.new, label = TRUE)

# Find variables that have higher cross-correlation
M <- data.matrix(data.new)
corrM <- cor(M)
highlyCorrM <- findCorrelation(corrM, cutoff = 0.5)
names(data.new)[highlyCorrM]

# Sample size: two thirds of the rows go to the training set
smp_size <- floor(2 / 3 * nrow(data.new))
set.seed(2)

# Shuffle the rows, then split into training and test sets
data.new <- data.new[sample(nrow(data.new)), ]
data.train <- data.new[seq_len(smp_size), ]
# The test set starts at the row right after the training rows.
data.test <- data.new[(smp_size + 1):nrow(data.new), ]

# Model building
# I use the entire dataset because the training one does not have all the
# levels for the logistic regression, since the example dataset is too small.
rmodel <- glm(
  formula = oly_success ~ .,
  data = data.new,
  family = binomial(link = "logit")
)
summary(rmodel)