I am working with the R programming language.
- In this problem, I have a dataset with 2 variables: Height and Salary
- I randomly split this dataset into 2 parts: train and test
For the train data:
- I first break the Salary variable into 3 quantiles (0.33, 0.66 and 0.99)
- For each of these Salary quantiles, I calculate the corresponding 80th quantile for the Height variable
For the test data:
- I organize the test data using the same Salary quantiles from the train data
- Using the 80th Height quantiles from the train data, I calculate what percent of rows are smaller than this 80th quantile.
- I then summarize all this information into a results table
I have posted my code below that corresponds to the above steps:
#PART 1
#create data
library(dplyr)
library(caret)
set.seed(123)
salary <- rnorm(1000,5,5)
height <- rnorm(1000,2,2)
my_data = data.frame(salary, height)
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary > salary_quantiles$quant_1 & train$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test
test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary > salary_quantiles$quant_1 & test$salary < salary_quantiles$quant_2, "B", "C")))
test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]
test$accuracy = ifelse(test$height_pred > test$height, 1, 0)
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 1
results$total_mean = mean(test$accuracy)
#END : view results
salary_type Mean iteration total_mean
1 A 0.7472527 1 0.7666667
2 B 0.8090909 1 0.7666667
3 C 0.7373737 1 0.7666667
Question : I want to transform this above procedure into a "k-fold cross validation", where I test how accurate the "80th quantile" is (i.e. on average, what percent of times are observations below the 80th height quantile for their corresponding salary group). This would look something like the image below:
I know how to do this manually. For example, here is the second "fold" (i.e. second iteration):
#Second Fold
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary > salary_quantiles$quant_1 & train$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test
test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary > salary_quantiles$quant_1 & test$salary < salary_quantiles$quant_2, "B", "C")))
test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]
test$accuracy = ifelse(test$height_pred > test$height, 1, 0)
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 2
results$total_mean = mean(test$accuracy)
Here is the third fold:
#Third Fold
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#etc etc etc
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 3
results$total_mean = mean(test$accuracy)
And this can be repeated "k" number of times (i.e. "k folds", "k iterations").
Problem I am trying to recreate a "loop" for this procedure. In the end, the following table would be produced (e.g. max iterations = 10) :
iteration_aka_fold_number salary_a_accuracy salary_b_accuracy salary_c_accuracy total_accuracy
1 1 79 77 75 79
2 2 77 79 80 79
3 3 78 79 71 79
What I tried so far : I tried to write the following loop to create the above table:
for (i in 1:10)
{
#PART 2
#create train_i and test_i data
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test_i
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_i = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_i$iteration = i
results_i$total_mean = mean(test_i$accuracy)
}
But this only keeps the last iteration:
> results
salary_type Mean iteration total_mean
1 A 0.7582418 10 0.7566667
2 B 0.7818182 10 0.7566667
3 C 0.7272727 10 0.7566667
Can someone please show me how to write this loop correctly?
Thanks
CodePudding user response:
Perhaps this would suit:
results <- list()
for (i in 1:10) {
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_tmp = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_tmp$iteration = i
results_tmp$total_mean = mean(test_i$accuracy)
results[[i]] <- results_tmp
}
results
results_df <- do.call(rbind.data.frame, results)
This loads your results into a list (called "results") which you can then reduce to a single dataframe ("results_df"). Does this solve your problem?
CodePudding user response:
The issue is that you are overwriting the result each time so you just get the final output in results_i
As the previous poster wrote, you should define an object before the loop, i.e.
results <- list() #output as list
And then store each output as an item within that list by modifying just the end of the loop so that each output is saved into the list, rather than overwriting the dataframe
for (i in 1:10){
#PART 2
#create train_i and test_i data
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test_i
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_tmp = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_tmp$iteration = i
results_tmp$total_mean = mean(test_i$accuracy)
results[[i]] <- results_tmp
}
This is just as done by the previous poster again. Then you can collapse the list into a dataframe with reduce()
& rbind()
results_df <- results %>% reduce(rbind)