Home > Back-end >  How can I make this R function more efficient?
How can I make this R function more efficient?

Time:02-10

I don't understand how it is possible to make this code more efficient without parallelizing the problem. Could I combine both loops, or substitute the first loop with easy-to-understand functions?

#' Original (slow) benchmark function from the question.
#'
#' Writes 100 CSV files of random data, reads them back, adds 81 to every
#' cell with a double for-loop, rbinds the result onto the original data,
#' and returns the approximate memory consumed (invisibly, in MB as
#' reported by gc()).
#'
#' NOTE(review): .Internal(gc(...)) is non-API; the supported call is
#' gc() -- and the second snapshot may run after memory has already been
#' freed, so the returned delta is only a rough estimate.
a_function <- function() {
  # Store initial memory state
  tt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])

  set.seed(150)
  dir_data <- file.path("data")

  # Create the folder if it is not already there (was commented out in the
  # original; without it write.table() fails on a clean working directory)
  if (!dir.exists(dir_data)) {
    dir.create(dir_data, showWarnings = FALSE)
  }

  random_data <- data.frame(matrix(rnorm(1e4), ncol = 100))

  # For every row in random_data, get some more data and write it to a file
  for (i in seq_len(nrow(random_data))) {
    ## make the file name to write to
    file_name <- file.path(dir_data, paste0("file", i, ".csv"))

    # Re-seeding inside the loop makes every file identical
    set.seed(150)
    extra_random_data <- data.frame(matrix(rnorm(1e4), ncol = 100))

    write.table(extra_random_data, file = file_name, sep = ";",
                row.names = FALSE)
  }

  # For every file, read the data back, add 81 to every value, and rbind
  # it on top of the original data frame (the original comment said 42,
  # but the code adds 81)
  for (i in seq_len(nrow(random_data))) {
    # Construct the file name
    file_name <- file.path(dir_data, paste0("file", i, ".csv"))

    ## read in the data
    data_read <- read.table(file_name, sep = ";", header = TRUE)

    ## add 81 to every number in the matrix
    i_result <- data.frame()
    for (j in seq_len(nrow(data_read))) {
      j_row <- data_read[j, ]
      for (k in seq_along(j_row)) {
        # BUG FIX: the "+" operator was stripped in the pasted code
        # ("j_row[[k]]   81" is a syntax error in R)
        j_row[[k]] <- j_row[[k]] + 81
      }
      # Growing with rbind() in a loop is O(n^2) -- this is the hot spot
      i_result <- rbind(i_result, j_row)
    }

    # Add the result to the original data frame; `final` is overwritten on
    # every iteration, so only the last file's result survives
    final <- rbind(i_result, random_data)
  }

  print("OK-2")

  unlink(dir_data, recursive = TRUE)

  # Compare against the expected output stored alongside the script
  if (!assertthat::are_equal(dget("Solution_1.dat"), final)) {
    stop("final data.frame is not equal to the expected output")
  }

  # Check how much memory is consumed by the function
  return(invisible(sum(.Internal(gc(FALSE, FALSE, TRUE))[13:14]) - tt))
}

CodePudding user response:

I am not sure you can capture the memory usage with gc() the way you do, as by the time you run it the second time the memory might already have been freed after the operation completed. RStudio has a "Profile" menu that helps you profile code with regard to execution speed and memory usage.

Please see a version of your function below, that runs faster. I left the gc() calls in there, taking them out shaves off another second or so on my machine. Please see a comparison of speed at the bottom.

Faster function

  • does not use for-loops
  • uses vectorization when possible (see how file_names are created and how you can add a number to every cell in a matrix just with `+` instead of a for-loop)
#' Faster, vectorized version of a_function().
#'
#' Same overall work -- write 100 CSV files of random data, read them back,
#' add 81 to every cell, and rbind onto the original data -- but with
#' lapply() and vectorized "+" instead of nested for-loops.
#' Returns (invisibly) the gc()-reported memory delta.
b_function <- function() {
  # Store initial memory state (non-API .Internal(gc()) kept from the
  # original; gc() is the supported interface)
  tt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])

  set.seed(150)
  random_data <- data.frame(matrix(rnorm(1e4), ncol = 100))

  # Create the folder if it does not exist yet
  dir_data <- file.path("data")
  if (!dir.exists(dir_data)) {
    dir.create(dir_data, showWarnings = FALSE)
  }

  # Build all file names in one vectorized call instead of per-iteration
  file_names <-
    file.path(dir_data, paste0("file", seq_len(nrow(random_data)), ".csv"))

  # For every file name, generate extra data and write it out.
  # write.table() is called purely for its side effect, so the lapply()
  # result is discarded rather than bound to an unused variable.
  invisible(lapply(file_names, \(file_name) {
    extra_random_data <- data.frame(matrix(rnorm(1e4), ncol = 100))
    write.table(
      extra_random_data,
      file = file_name,
      sep = ";",
      row.names = FALSE
    )
  }))

  # Read in the data we just stored
  extra_random_datal2 <-
    lapply(file_names, read.table, sep = ";", header = TRUE)

  # Add 81 to each cell (vectorized "+") and rbind to the original data.
  # BUG FIX: the "+" operator was stripped in the pasted code
  # ("x   81" is a syntax error in R).
  result <-
    lapply(extra_random_datal2, \(x) rbind(random_data, x + 81))

  unlink(dir_data, recursive = TRUE)

  # Check how much memory is consumed by the function
  return(invisible(sum(.Internal(gc(FALSE, FALSE, TRUE))[13:14]) - tt))
}

Original Function

#' Cleaned-up copy of the questioner's original function, used as the slow
#' baseline in the timing comparison below.
#'
#' Writes 100 identical CSV files of random data (the seed is reset inside
#' the loop), reads each back, adds 81 to every cell via a double for-loop,
#' rbinds onto the original data, and returns (invisibly) the gc()-reported
#' memory delta. The `Solution_1.dat` comparison is left commented out so
#' the benchmark runs without that external file.
a_function <- function() {
  # Store initial memory state
  tt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
  set.seed(150)
  dir_data <- file.path("data")

  # Create the folder (idempotent; warnings suppressed if already there)
  dir.create(dir_data, showWarnings = FALSE)
  random_data <- data.frame(matrix(rnorm(1e4), ncol = 100))

  # For every row in random_data, get some more data and write it to a file
  for (i in seq_len(nrow(random_data))) {
    ## make the file name to write to
    file_name <- file.path(dir_data, paste0("file", i, ".csv"))

    set.seed(150)
    extra_random_data <- data.frame(matrix(rnorm(1e4), ncol = 100))

    write.table(
      extra_random_data,
      file = file_name,
      sep = ";",
      row.names = FALSE
    )
  }

  # For every file, read the data back, add 81 (the original comment said
  # 42, but the code adds 81) and rbind it onto the original data frame
  for (i in seq_len(nrow(random_data))) {
    # Construct the file name
    file_name <- file.path(dir_data, paste0("file", i, ".csv"))

    ## read in the data
    data_read <- read.table(file_name, sep = ";", header = TRUE)

    ## add 81 to every number in the matrix
    i_result <- data.frame()
    for (j in seq_len(nrow(data_read))) {
      j_row <- data_read[j, ]
      for (k in seq_along(j_row)) {
        # BUG FIX: the "+" operator was stripped in the pasted code
        # ("j_row[[k]]   81" is a syntax error in R)
        j_row[[k]] <- j_row[[k]] + 81
      }
      # rbind() in a loop grows the result quadratically -- the hot spot
      i_result <- rbind(i_result, j_row)
    }

    # Add the result to the original data frame (overwritten each
    # iteration; only the last file's result survives)
    final <- rbind(i_result, random_data)
  }

  print("OK-2")

  unlink(dir_data, recursive = TRUE)

  # if (!assertthat::are_equal(dget("Solution_1.dat"),final)){
  #   stop("final data.frame is not equal to the expected output")
  # }

  # Check how much memory is consumed by the function
  return(invisible(sum(.Internal(
    gc(FALSE, FALSE, TRUE)
  )[13:14]) - tt))
}

Speed comparison

library(tictoc)
tic()
a_function()
#> [1] "OK-2"
toc()
#> 30.467 sec elapsed

tic()
b_function()
toc()
#> 2.669 sec elapsed
  •  Tags:  
  • r
  • Related