Improving the speed function by identifying the bottle neck of the function-CodePudding

I hope this question will not be considered to broad. I am however not sure how to ask it differently: In his youtube video, Ed Boone gives introduction into ABM with R. He writes the following code (only some variable names have been changed). The code runs fine with 1000 observations, but it becomes very slow when I amp up the observations. As a result I would like to improve the speed of the function(s). I could manually add start.time_x <- Sys.time() etc. after every line (to see what takes a lot of time), but I was wondering if there is a better way to identify the bottle neck, since all the nested for-loops make even that quite complicated (because I would also have to consider how many times each code runs) :

# ABM_Covid - II

start.time <- Sys.time()
Data_Generator <- function(nPop1, E0, I0) {
    # Create a population of susceptibles
    Data <- data.frame( AgentNo=1:nPop1,
                        State="Susceptible",
                        Mixing= runif(nPop1,0,1),
                        TimeE = 0,
                        TimeI = 0,
                        stringsAsFactors = FALSE)
    Data$State[1:E0] <- "Exposed"  # This just says that the first person is exposed, since the mixing is random anyway, this is not an issue (because the mixing of Exposed is random)
    Data$Time[1:E0] <- rbinom(E0, 13, 0.5)   1 # Exposure up to 14 days
    Data$State[(E0 1):(E0 I0)] <- "Infected"
    Data$Time[(E0 1):(E0 I0)] <- rbinom(I0, 12, 0.5)
    return(Data)
}

ABM_Covid <- function(Data, parameters, runtime){
    nPop1 <- nrow(Data)
    # runtime <- 15
    Results <- data.frame(  Susceptible     = rep (0, runtime),
                            Exposed         = rep (0, runtime),
                            Infected        = rep (0, runtime),
                            Recovered       = rep (0, runtime),
                            Deaths          = rep (0, runtime))
    # Move people through time
    for (k in 1:runtime){
        # Moving people through time
        StateSusceptible <- (1:nPop1)[Data$State == "Susceptible"]
        StateSusceptible_or_Exposed <- (1:nPop1)[Data$State == "Susceptible" | Data$State == "Exposed"]
        for (i in StateSusceptible) {
            # Determine if they like to meet others
            Mix1 <- Data$Mixing[i]
            # How many agents will they meet? The plus one meets everybody meets somebody
            Meetings <- round(Mix1*parameters$MaxMix,0)   1
            # Grab the agents they will meet
            People_met <- sample(StateSusceptible_or_Exposed, Meetings, replace=TRUE, prob = Data$Mixing[StateSusceptible_or_Exposed])
            for (j in 1:length(People_met)) {
                    # Grab who they will meet
                    Meetingsa <- Data[People_met[j], ]
                    # If exposed change State
                    if(Meetingsa$State== "Exposed") {
                        Urand1 <- runif(1,0,1)
                        if (Urand1 < parameters$S2E){
                            Data$State[i] <-  "Exposed"
                        }   
                    }
                }
            }
            # Grab those who have been exposed and increment
            StateE1 <- (1:nPop1)[Data$State== "Exposed"]
            Data$TimeE[StateE1] = Data$TimeE[StateE1]   1
            StateE2 <- (1:nPop1)[Data$State== "Exposed" & Data$TimeE > 14]
            Data$State[StateE2] <- "Recovered"    
            # Grab those who could become sick
            StateE3 <- (1:nPop1)[Data$State== "Exposed" & Data$TimeE > 3]
            for (i in StateE3){
                Urand1 <- runif(1,0,1)
                # randomly assign whether they get sick or not
                if ( Urand1 < parameters$E2I ) {
                    Data$State[i] <- "Infected"
                }
            }
            # Update how long they have been sick
            StateI1 <- (1:nPop1)[Data$State== "Infected"]
            Data$TimeI[StateI1] = Data$TimeI[StateI1]   1
            # Recovered bin
            StateI2 <- (1:nPop1)[Data$State== "Infected" & Data$TimeI > 14]
            Data$State[StateI2] <- "R"
            # Not recovered could potentially die
            StateI3 <- (1:nPop1)[Data$State== "Infected" & Data$TimeI < 15]
            Data$State[StateI3] <- ifelse(runif(length(StateI3), 0, 1 ) > parameters$I2D, "Infected", "Deaths")
            Results$Susceptible[k] <- length(Data$State[Data$State=="Susceptible"])
            Results$Exposed[k] <- length(Data$State[Data$State=="Exposed"])
            Results$Infected[k] <- length(Data$State[Data$State=="Infected"])
            Results$Recovered[k] <- length(Data$State[Data$State=="Recovered"])
            Results$Deaths[k] <- length(Data$State[Data$State=="Deaths"])
    }
    return(Results)
}

Data <- Data_Generator(1000, E0=5, I0=2)
parameters <- data.frame( MaxMix = 10,
                    S2E = 0.25,
                    E2I = 0.1,
                    I2D = 0.1)
Model1 <- ABM_Covid(Data, parameters, runtime=25)
plot(1:25, Model1$Susceptible, type="l", col="purple", ylim = c(0,1000))
lines(1:25, Model1$Exposed, type="l", col="orange")
lines(1:25, Model1$Infected, type="l", col="red")
lines(1:25, Model1$Recovered, type="l", col="seagreen")
lines(1:25, Model1$Deaths, type="l", col="black")
end.time <- Sys.time()
time.taken <- end.time - start.time

CodePudding user response：

Here is one way to optimize one part of your function (the update of the state). This way you get rid of the i loop and the j loop.

I am not sure if I understood every detail of the algorithm. But maybe you can use some parts and ideas of this solution anyways.

update_State <- function(nmeeting, seed = 123) {
  set.seed(seed)
  x <- Data %>%
    filter(State %in% c("Susceptible", "Exposed")) %>%
    slice_sample(n = nmeeting) %>%
    filter(State == "Exposed") %>%
    summarise(prob = any(runif(n()) <= paramters$S2E)) %>%
    pull(prob)
  
  return(if (x) "Exposed" else "Susceptible")
  
}

Data %>% 
  filter(State %in% c("Susceptible", "Exposed")) %>%
  mutate(
    n_meetings =  round(Mixing*parameters$MaxMix,0)   1,
    new_State = map(n_meetings, update_State) #this is the new State after all the Meetings have been made
  )

The idea is to check all exposed person with whom a given person meets all in once.

This runs in 4-5 seconds.

CodePudding user response：

Most versions of R are compiled with support for profiling, see ?Rprof. The use would be something like:

Rprof()
# <Your code without the system.time calls>
Rprof(Null)

summaryRprof()
# $by.self
#                 self.time self.pct total.time total.pct
# "["                  0.20    26.32       0.62     81.58
# "[[.data.frame"      0.16    21.05       0.24     31.58
# "[.data.frame"       0.10    13.16       0.42     55.26
# ...

Example based on this page.