R: Manually Looping Functions-CodePudding

I am working with the R programming language.

In this problem, I have a dataset with 2 variables: Height and Salary
I randomly split this dataset into 2 parts: train and test

For the train data:

I first break the Salary variable into 3 quantiles (0.33, 0.66 and 0.99)
For each of these Salary quantiles, I calculate the corresponding 80th quantile for the Height variable

For the test data:

I organize the test data using the same Salary quantiles from the train data
Using the 80th Height quantiles from the train data, I calculate what percent of rows are smaller than this 80th quantile.
I then summarize all this information into a results table

I have posted my code below that corresponds to the above steps:

#PART 1
#create data

library(dplyr)
library(caret)
set.seed(123)



salary <- rnorm(1000,5,5)
height <- rnorm(1000,2,2)
my_data = data.frame(salary, height)


#PART 2
#create train and test data


train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]

#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))




#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary  >  salary_quantiles$quant_1  & train$salary < salary_quantiles$quant_2, "B", "C")))

#PART 5
height_quantiles = data.frame( train %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))



#PART 6
#test

test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary  >  salary_quantiles$quant_1  & test$salary < salary_quantiles$quant_2, "B", "C")))

  test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]

test$accuracy = ifelse(test$height_pred > test$height, 1, 0)

#PART 7 : Results Frame

results = data.frame(test %>%
    group_by(salary_type) %>%
    dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))

results$iteration = 1

results$total_mean = mean(test$accuracy)

#END : view results

 salary_type      Mean iteration total_mean
1           A 0.7472527         1  0.7666667
2           B 0.8090909         1  0.7666667
3           C 0.7373737         1  0.7666667

Question : I want to transform this above procedure into a "k-fold cross validation", where I test how accurate the "80th quantile" is (i.e. on average, what percent of times are observations below the 80th height quantile for their corresponding salary group). This would look something like the image below:

I know how to do this manually. For example, here is the second "fold" (i.e. second iteration):

#Second Fold
#PART 2
#create train and test data


train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]

#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))




#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary  >  salary_quantiles$quant_1  & train$salary < salary_quantiles$quant_2, "B", "C")))

#PART 5
height_quantiles = data.frame( train %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))



#PART 6
#test

test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary  >  salary_quantiles$quant_1  & test$salary < salary_quantiles$quant_2, "B", "C")))

  test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]

test$accuracy = ifelse(test$height_pred > test$height, 1, 0)

#PART 7 : Results Frame

results = data.frame(test %>%
    group_by(salary_type) %>%
    dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))

results$iteration = 2

results$total_mean = mean(test$accuracy)

Here is the third fold:

#Third Fold 
#PART 2
#create train and test data


train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]

#etc etc etc


#PART 7 : Results Frame

results = data.frame(test %>%
    group_by(salary_type) %>%
    dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))

results$iteration = 3

results$total_mean = mean(test$accuracy)

And this can be repeated "k" number of times (i.e. "k folds", "k iterations").

Problem I am trying to recreate a "loop" for this procedure. In the end, the following table would be produced (e.g. max iterations = 10) :

     iteration_aka_fold_number salary_a_accuracy salary_b_accuracy salary_c_accuracy total_accuracy
1                         1                79                77                75             79
2                         2                77                79                80             79
3                         3                78                79                71             79

What I tried so far : I tried to write the following loop to create the above table:

for (i in 1:10)
    
{
    
    
    #PART 2
    #create train_i and test_i data
    
    
    train_i<-sample_frac(my_data, 0.7)
    sid<-as.numeric(rownames(train_i)) 
    test_i<-my_data[-sid,]
    
    #PART 3
    salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
                                                          quant_2 = quantile(salary, 0.66),
                                                          quant_3 = quantile(salary, 0.99)))
    
    
    
    
    #PART 4
    train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary  >  salary_quantiles$quant_1  & train_i$salary < salary_quantiles$quant_2, "B", "C")))
    
    #PART 5
    height_quantiles = data.frame( train_i %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))
    
    
    
    #PART 6
    #test_i
    
    test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary  >  salary_quantiles$quant_1  & test_i$salary < salary_quantiles$quant_2, "B", "C")))
    
    test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
    
    test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
    
    #PART 7 : Results Frame
    
    results_i = data.frame(test_i %>%
                               group_by(salary_type) %>%
                               dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
    
    results_i$iteration = i
    
    results_i$total_mean = mean(test_i$accuracy)
    
}

But this only keeps the last iteration:

> results
  salary_type      Mean iteration total_mean
1           A 0.7582418        10  0.7566667
2           B 0.7818182        10  0.7566667
3           C 0.7272727        10  0.7566667

Can someone please show me how to write this loop correctly?

Thanks

CodePudding user response：

Perhaps this would suit:

results <- list()
for (i in 1:10) {
  train_i<-sample_frac(my_data, 0.7)
  sid<-as.numeric(rownames(train_i)) 
  test_i<-my_data[-sid,]
  
  salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
                                                        quant_2 = quantile(salary, 0.66),
                                                        quant_3 = quantile(salary, 0.99)))
  
  train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary  >  salary_quantiles$quant_1  & train_i$salary < salary_quantiles$quant_2, "B", "C")))
  
  height_quantiles = data.frame( train_i %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))
  test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary  >  salary_quantiles$quant_1  & test_i$salary < salary_quantiles$quant_2, "B", "C")))
  test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
  test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
  
  #PART 7 : Results Frame
  
  results_tmp = data.frame(test_i %>%
                           group_by(salary_type) %>%
                           dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
  
  results_tmp$iteration = i
  
  results_tmp$total_mean = mean(test_i$accuracy)
  results[[i]] <- results_tmp
}
results
results_df <- do.call(rbind.data.frame, results)

This loads your results into a list (called "results") which you can then reduce to a single dataframe ("results_df"). Does this solve your problem?

CodePudding user response：

The issue is that you are overwriting the result each time so you just get the final output in results_i

As the previous poster wrote, you should define an object before the loop, i.e.

results <- list() #output as list

And then store each output as an item within that list by modifying just the end of the loop so that each output is saved into the list, rather than overwriting the dataframe

 for (i in 1:10){
  
  
  #PART 2
  #create train_i and test_i data
  
  
  train_i<-sample_frac(my_data, 0.7)
  sid<-as.numeric(rownames(train_i)) 
  test_i<-my_data[-sid,]
  
  #PART 3
  salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
                                                        quant_2 = quantile(salary, 0.66),
                                                        quant_3 = quantile(salary, 0.99)))
  
  
  
  
  #PART 4
  train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary  >  salary_quantiles$quant_1  & train_i$salary < salary_quantiles$quant_2, "B", "C")))
  
  #PART 5
  height_quantiles = data.frame( train_i %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))
  
  
  
  #PART 6
  #test_i
  
  test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary  >  salary_quantiles$quant_1  & test_i$salary < salary_quantiles$quant_2, "B", "C")))
  
  test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
  
  test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
  
  #PART 7 : Results Frame
  
  results_tmp = data.frame(test_i %>%
                             group_by(salary_type) %>%
                             dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
  
  results_tmp$iteration = i
  
  results_tmp$total_mean = mean(test_i$accuracy)
  
  results[[i]] <- results_tmp
    
  
}

This is just as done by the previous poster again. Then you can collapse the list into a dataframe with reduce() & rbind()

results_df <- results %>% reduce(rbind)