I'm writing a agent-based model for phytoplankton. Right now it is fairly basic and includes functions for growth, uptake of a resource ("S" in the code), division, mortality, and a function to keep track of the total extracellular resource concentration outside of the cells (q represents the amount of resource in an individual, then I sum the q of all the individuals and subtract it from the total S). It has become quite a time hog from all the for loops taking more than 24 hours to run when it should be much faster. Are there ways that I can optimize my code to help it run faster?
uptake <- function(x){
vmax <- x[1]
km <- x[2]
S <- x[3]
result <- vmax*(S/(km S))
return(result)
}
growth <- function(x){
mumax <- x[1]
qnaught <- x[2]
q <- x[3]
result <- mumax*(1-(qnaught/q))
return(result)
}
division <- function(x){
mu <-x[1]
m <- x[2]
result <- mu*m
return(result)
}
extracellular <- function(x){
S <- x[1]
flow_rate <- x[2]
nutrient_inflow <- x[3]
quota_per_time <- x[4]
result <- ((flow_rate*nutrient_inflow)-(flow_rate*S)-quota_per_time)
return(result)
}
## Initial agents and time step parameters
num_agents <- 200
num_time_steps <- 400
## Set up data frame for agents
agents <- data.frame("ID" = 1:num_agents, "vmax" = 0.0000014, "km" = 17,
"mumax" = 1.1, "qnaught" = 0.00000036, "cell_size" = 0.00075,
"alive" = 1)
## Set up data frame for state variables
state <- data.frame("S" = 1.2)
agents <- cbind(agents, state)
state$flow_rate <- 0.86
state$nutrient_inflow <- 1.4
## Set up other parameter values
## q represents the amount of resource in an individual
q <- rep(0, num_agents)
size <- rep(0, num_agents)
m0 <- 0.00075
output <- data.frame()
agents_time <- data.frame()
## For loop for model
for (i in 1:num_time_steps) {
## mortality of agents in each time step
for (k in agents$ID){
random <- runif(1, min = 1, max = 100)
if (random < 10.0){
agents[k,7] = 0
}
}
agents_alive <- filter(agents, alive == 1)
## update S from state data frame
agents_alive$S <- state$S
q <- q[1:nrow(agents_alive)]
## sum uptake per time step to use for extracellular concentration
quota_per_agent <- apply(agents_alive[, c(2, 3, 8)], MARGIN = 1, FUN = function(x) uptake(x))
quota_per_time <- sum(quota_per_agent)
q <- q quota_per_agent
tmp <- cbind(agents_alive, q)
mu <- apply(tmp[, c(4, 5, 9)], MARGIN = 1, FUN = function(x) growth(x))
tmp <- cbind(tmp, mu)
size <- size[1:nrow(agents_alive)]
size <- size apply(tmp[, c(6, 10)], MARGIN = 1, FUN = function(x) division(x))
## if size calculated by division function is below minimum cell size (0.75),
## replace with minimum cell size
size <- ifelse(size < 0.00075, 0.00075, size)
index = 1
## division by cell size
for (j in size){
if (j > (2*tmp[index,6])){
size[index] = m0
q[index] = (q[index]/2)
last <- tail(agents$ID, n = 1)
new <- c(last 1, 0.0000014, 17,
1.1, 0.00000036, m0, 1, state$S)
agents <- rbind(agents, new)
agents_alive <- rbind(agents_alive, new)
q <- append(q, q[index])
mu <- append(mu, mu[index])
size <- append(size, size[index])
}
index = index 1
}
## update extracellular nutrient concentration state variable
state$quota_per_time <- quota_per_time
new_S <- state$S apply(state, MARGIN = 1, FUN = function(x) extracellular(x))
state <- replace(state, 1, new_S)
if (state$S < 0){
state$S <- 0
}
#dataframe of outputs for each time step
out_per_time <- data.frame("ID" = agents_alive$ID,
"vmax" = agents_alive$vmax,
"km" = agents_alive$km,
"S" = agents_alive$S,
"mumax" = agents_alive$mumax,
"qnaught" = agents_alive$qnaught,
"time" = rep(i, nrow(agents_alive)),
"q" = q,
"mu" = mu,
"cell_size" = size,
"alive" = agents_alive$alive)
#row-append to output dataframe that stores outputs for all agents and all time steps
output <- rbind(output, out_per_time)
}
CodePudding user response:
Using data.tables to implement the model will speed things up massively. A lot of the time here is spent on copying data within the data.frames and using a lot of memory to do so. data.tables use copy-in-place and much, much faster. I've done my best to understand the logic and replicate it... Where I may be off, hopefully an easy adjustment. Further, a number of for loops have been avoided by using data.table "set" operations and using mapply instead of apply functions.
library(data.table)
uptake2 <- function(vmax,km,S){
vmax*(S/(km S))
}
growth2 <- function(mumax,qnaught,q){
mumax*(1-(qnaught/q))
}
division2 <- function(mu,cell_size,min_cell_size){
max(mu*cell_size,min_cell_size)
}
extracellular2 <- function(S,flow_rate,nutrient_inflow,quota_per_time){
max(((flow_rate*nutrient_inflow)-(flow_rate*S)-quota_per_time),0)
}
## Initial agents and time step parameters
num_agents <- 10
num_time_steps <- 400
min_cell_size <- 0.00075
state <- data.table(S = 1.2)
## Set up data frame for agents
agents <- data.table(ID = 1:num_agents,
vmax = 0.0000014,
km = 17,
mumax = 1.1,
qnaught = 0.00000036,
min_cell_size = min_cell_size,
alive = 1,
S = state$S)
state[,flow_rate:= 0.86]
state[,nutrient_inflow:= 1.4]
## Set up other parameter values
output <- data.table(ID = numeric(0),
vmax = numeric(0),
km = numeric(0),
S = numeric(0),
mumax = numeric(0),
qnaught = numeric(0),
time = numeric(0),
q = numeric(0),
mu = numeric(0),
cell_size = numeric(0),
alive = numeric(0))
#agents_time <- data.frame()
## For loop for model
for (i in 1:num_time_steps) {
## mortality of agents in each time step
for (k in agents$ID){
random <- runif(1, min = 1, max = 100)
if (random < 10.0){
agents[k,alive:=0]
}
}
agents_alive <- agents[alive==1]
if (nrow(agents_alive)==0) next;
## sum uptake per time step to use for extracellular concentration
agents_alive[,q:=mapply(uptake2,
vmax=vmax,
km=km,
S=state$S)]
quota_per_time <- sum(agents_alive$q)
agents_alive[,mu:=mapply(growth2,
mumax=mumax,
qnaught=qnaught,
q=q)]
agents_alive[,size:=mapply(division2,
mu=mu,
cell_size=min_cell_size,
min_cell_size=min_cell_size)]
## division by cell size
create_new_agents <- nrow(agents_alive[size > 2*min_cell_size])
if (create_new_agents > 0) {
new_agents <- data.table(ID = max(agents$ID) (1:create_new_agents),
vmax = 0.0000014,
km = 17,
mumax = 1.1,
qnaught = 0.00000036,
cell_size = min_cell_size,
alive = 1,
S = state$S)
agents_alive[size > 2*cell_size,
`:=`(size=min_cell_size,
q=q/2)]
agents <- rbindlist(list(agents,
new_agents))
agents_alive <- rbindlist(list(agents_live,
new_agents))
}
## update extracellular nutrient concentration state variable
state$quota_per_time <- quota_per_time
state[,S:=extracellular2(S=S,
flow_rate=flow_rate,
nutrient_inflow=nutrient_inflow,
quota_per_time=quota_per_time)]
#row-append to output dataframe that stores outputs for all agents and all time steps
output <- rbindlist(list(output,
agents_alive[,
.(ID,
vmax,
km,
S,
mumax,
qnaught,
time=i,
q,
mu,
cell_size=size,
alive)]
))
print(i)
}
CodePudding user response:
I haven't looked at this in much detail, but I imagine that you may be able to avoid the internal for loop with the uniformly distributed random variate. You may be able to use (perhaps with some adjustment) something like:
agents$random <- runif(length(agents$ID), min = 1, max = 100)
agents$alive[agents$random < 10] <- 0
head(agents)
ID vmax km mumax qnaught cell_size alive S random
1 1 1.4e-06 17 1.1 3.6e-07 0.00075 0 1.2 5.313743
2 2 1.4e-06 17 1.1 3.6e-07 0.00075 1 1.2 78.682158
3 3 1.4e-06 17 1.1 3.6e-07 0.00075 1 1.2 49.320952
4 4 1.4e-06 17 1.1 3.6e-07 0.00075 1 1.2 56.237847
5 5 1.4e-06 17 1.1 3.6e-07 0.00075 0 1.2 5.142588
6 6 1.4e-06 17 1.1 3.6e-07 0.00075 1 1.2 99.543501
If you still need the outer for-loop to iterate at each time step, you can essentially erase the "random" column afterwards with:
agents$random <- NULL