Is there a way to optimize my for loops or use another function so it runs quicker?-CodePudding

I'm writing a agent-based model for phytoplankton. Right now it is fairly basic and includes functions for growth, uptake of a resource ("S" in the code), division, mortality, and a function to keep track of the total extracellular resource concentration outside of the cells (q represents the amount of resource in an individual, then I sum the q of all the individuals and subtract it from the total S). It has become quite a time hog from all the for loops taking more than 24 hours to run when it should be much faster. Are there ways that I can optimize my code to help it run faster?

uptake <- function(x){
  vmax <- x[1]
  km <- x[2]
  S <- x[3]
  result <- vmax*(S/(km S))
  return(result)
}

growth <- function(x){
  mumax <- x[1]
  qnaught <- x[2]
  q <- x[3]
  result <- mumax*(1-(qnaught/q))
  return(result)
}

division <- function(x){
  mu <-x[1]
  m <- x[2]
  result <- mu*m
  return(result)
}

extracellular <- function(x){
  S <- x[1]
  flow_rate <- x[2]
  nutrient_inflow <- x[3]
  quota_per_time <- x[4]
  result <- ((flow_rate*nutrient_inflow)-(flow_rate*S)-quota_per_time) 
  return(result)
}


## Initial agents and time step parameters
num_agents <- 200
num_time_steps <- 400

## Set up data frame for agents
agents <- data.frame("ID" = 1:num_agents, "vmax" = 0.0000014, "km" = 17, 
                     "mumax" = 1.1, "qnaught" = 0.00000036, "cell_size" = 0.00075, 
                     "alive" = 1)
## Set up data frame for state variables
state <- data.frame("S" = 1.2)
agents <- cbind(agents, state)
state$flow_rate <- 0.86
state$nutrient_inflow <- 1.4
## Set up other parameter values
## q represents the amount of resource in an individual
q <- rep(0, num_agents)
size <- rep(0, num_agents)
m0 <- 0.00075
output <- data.frame()
agents_time <- data.frame()

## For loop for model
for (i in 1:num_time_steps) {
  ## mortality of agents in each time step
  for (k in agents$ID){
    random <- runif(1, min = 1, max = 100)
    if (random < 10.0){
      agents[k,7] = 0
    }
  }
  agents_alive <- filter(agents, alive == 1)
  
  ## update S from state data frame
  agents_alive$S <- state$S
  q <- q[1:nrow(agents_alive)]
  ## sum uptake per time step to use for extracellular concentration

  quota_per_agent <- apply(agents_alive[, c(2, 3, 8)], MARGIN = 1, FUN = function(x) uptake(x))
  quota_per_time <- sum(quota_per_agent)
  q <- q   quota_per_agent
  tmp <- cbind(agents_alive, q)
  mu <- apply(tmp[, c(4, 5, 9)], MARGIN = 1, FUN = function(x) growth(x))
  tmp <- cbind(tmp, mu)
  size <- size[1:nrow(agents_alive)]
  size <- size   apply(tmp[, c(6, 10)], MARGIN = 1, FUN = function(x) division(x))

  ## if size calculated by division function is below minimum cell size (0.75), 
  ## replace with minimum cell size
  size <- ifelse(size < 0.00075, 0.00075, size)
  index = 1
  ## division by cell size
  for (j in size){
    if (j > (2*tmp[index,6])){
      size[index] = m0
      q[index] = (q[index]/2)
      last <- tail(agents$ID, n = 1)
      new <- c(last   1, 0.0000014, 17,
               1.1, 0.00000036, m0, 1, state$S)
      agents <- rbind(agents, new)
      agents_alive <- rbind(agents_alive, new)
      q <- append(q, q[index])
      mu <- append(mu, mu[index])
      size <- append(size, size[index])
    }
    index = index   1
  }

  ## update extracellular nutrient concentration state variable
  state$quota_per_time <- quota_per_time
  new_S <- state$S   apply(state, MARGIN = 1, FUN = function(x) extracellular(x))
  state <- replace(state, 1, new_S)
  if (state$S < 0){
    state$S <- 0
  }
  #dataframe of outputs for each time step
  out_per_time <- data.frame("ID" = agents_alive$ID, 
                             "vmax" = agents_alive$vmax, 
                             "km" = agents_alive$km,
                             "S" = agents_alive$S, 
                             "mumax" = agents_alive$mumax,
                             "qnaught" = agents_alive$qnaught,
                             "time" = rep(i, nrow(agents_alive)), 
                             "q" = q,
                             "mu" = mu,
                             "cell_size" = size,
                             "alive" = agents_alive$alive)
  
  #row-append to output dataframe that stores outputs for all agents and all time steps
  output <- rbind(output, out_per_time)
}

CodePudding user response：

Using data.tables to implement the model will speed things up massively. A lot of the time here is spent on copying data within the data.frames and using a lot of memory to do so. data.tables use copy-in-place and much, much faster. I've done my best to understand the logic and replicate it... Where I may be off, hopefully an easy adjustment. Further, a number of for loops have been avoided by using data.table "set" operations and using mapply instead of apply functions.

library(data.table)

uptake2 <- function(vmax,km,S){
  vmax*(S/(km S))
}

growth2 <- function(mumax,qnaught,q){
  mumax*(1-(qnaught/q))
}

division2 <- function(mu,cell_size,min_cell_size){
  max(mu*cell_size,min_cell_size)
}

extracellular2 <- function(S,flow_rate,nutrient_inflow,quota_per_time){
  max(((flow_rate*nutrient_inflow)-(flow_rate*S)-quota_per_time),0) 
}


## Initial agents and time step parameters
num_agents <- 10
num_time_steps <- 400
min_cell_size <- 0.00075

state <- data.table(S = 1.2)

## Set up data frame for agents
agents <- data.table(ID = 1:num_agents, 
                     vmax = 0.0000014, 
                     km = 17, 
                     mumax = 1.1, 
                     qnaught = 0.00000036, 
                     min_cell_size = min_cell_size, 
                     alive = 1,
                     S = state$S)

state[,flow_rate:= 0.86]
state[,nutrient_inflow:= 1.4]
## Set up other parameter values


output <- data.table(ID = numeric(0),
                     vmax = numeric(0),
                     km = numeric(0),
                     S = numeric(0),
                     mumax = numeric(0),
                     qnaught = numeric(0),
                     time = numeric(0),
                     q = numeric(0),
                     mu = numeric(0),
                     cell_size = numeric(0),
                     alive = numeric(0))


#agents_time <- data.frame()

## For loop for model
for (i in 1:num_time_steps) {
  ## mortality of agents in each time step
  for (k in agents$ID){
    random <- runif(1, min = 1, max = 100)
    if (random < 10.0){
      agents[k,alive:=0]
    }
  }
  agents_alive <- agents[alive==1]

  if (nrow(agents_alive)==0) next;
  
  ## sum uptake per time step to use for extracellular concentration
  agents_alive[,q:=mapply(uptake2,
                          vmax=vmax,
                          km=km,
                          S=state$S)]
  
  quota_per_time <- sum(agents_alive$q)

  agents_alive[,mu:=mapply(growth2,
                           mumax=mumax,
                           qnaught=qnaught,
                           q=q)]
  
  agents_alive[,size:=mapply(division2,
                             mu=mu,
                             cell_size=min_cell_size,
                             min_cell_size=min_cell_size)]

  ## division by cell size
  create_new_agents <- nrow(agents_alive[size > 2*min_cell_size])
  if (create_new_agents > 0) {
    new_agents <- data.table(ID = max(agents$ID) (1:create_new_agents),
                             vmax = 0.0000014, 
                             km = 17, 
                             mumax = 1.1, 
                             qnaught = 0.00000036, 
                             cell_size = min_cell_size, 
                             alive = 1,
                             S = state$S)

    agents_alive[size > 2*cell_size,
                 `:=`(size=min_cell_size,
                      q=q/2)]
    
    agents <- rbindlist(list(agents,
                             new_agents))
    
    agents_alive <- rbindlist(list(agents_live,
                                   new_agents))
    
  }


  
  ## update extracellular nutrient concentration state variable
  state$quota_per_time <- quota_per_time
  
  state[,S:=extracellular2(S=S,
                           flow_rate=flow_rate,
                           nutrient_inflow=nutrient_inflow,
                           quota_per_time=quota_per_time)]
  
  #row-append to output dataframe that stores outputs for all agents and all time steps
  output <- rbindlist(list(output, 
                           agents_alive[,
                                        .(ID,
                                          vmax,
                                          km,
                                          S,
                                          mumax,
                                          qnaught,
                                          time=i,
                                          q,
                                          mu,
                                          cell_size=size,
                                          alive)]
                           ))
  print(i)
}

CodePudding user response：

I haven't looked at this in much detail, but I imagine that you may be able to avoid the internal for loop with the uniformly distributed random variate. You may be able to use (perhaps with some adjustment) something like:

agents$random <- runif(length(agents$ID), min = 1, max = 100)
agents$alive[agents$random < 10] <- 0

head(agents)
  ID    vmax km mumax qnaught cell_size alive   S    random
1  1 1.4e-06 17   1.1 3.6e-07   0.00075     0 1.2  5.313743
2  2 1.4e-06 17   1.1 3.6e-07   0.00075     1 1.2 78.682158
3  3 1.4e-06 17   1.1 3.6e-07   0.00075     1 1.2 49.320952
4  4 1.4e-06 17   1.1 3.6e-07   0.00075     1 1.2 56.237847
5  5 1.4e-06 17   1.1 3.6e-07   0.00075     0 1.2  5.142588
6  6 1.4e-06 17   1.1 3.6e-07   0.00075     1 1.2 99.543501

If you still need the outer for-loop to iterate at each time step, you can essentially erase the "random" column afterwards with:

agents$random <- NULL