Optimizing ifelse loop in R-CodePudding

[[[[ I've been trying to optimize a loop in R, but because I'm not an expert, I can't make much progress. I was wondering if you could help me because it's taking way too much time.]]]]

Basically, I have a data frame and a list of data frames, as these ones here below:


set.seed(123)

spp.list <- c("spA", "spB")
locations <- c("loc1", "loc2")
depths <- c(0:1)
years <- c(2000, 2001)
months <- c(1,2)

#Dataframe 1 (f.data):
n.rows <- 10
f.data <- data.frame(spp = sample(spp.list, n.rows, replace = T), 
                     location = sample(locations, n.rows, replace = T), 
                     depth = sample(depths, n.rows, replace = T),
                     Y = sample(years, n.rows, replace = T),
                     M = sample(months, n.rows, replace = T)
)




#List of dataframes (loc.list)
loc1 <- data.frame(Y = years,
                   M = months,
                   '0' = c(10,15),
                   '1' = c(0,5)
)
names(loc1)[3:4] <-  c(-0,-1)


loc2 <- data.frame(Y = years,
                   M = months,
                   '0' = c(13,18),
                   '1' = c(3,7)
)
names(loc2)[3:4] <-  c(-0,-1)

loc.list <- list(loc1,loc2)

names(loc.list) <- c('loc1','loc2')

Dataframe 1 (f.data) contains a list of species, locality, depth, year and month. The list of dataframes (loc.list) contains individual dataframes for each locality (pretty much the same localities in f.data). Each individual dataframe in this list will also contain year and month, but also a value for distinct categories of an element (depth; each depth in this case is represented by a independent columns: 0 = surface, -1 = 1 m deep).

What I need to do is to screen both elements to match location, depth, year and month, so that I can assign the value recorded in loc.list into f.data. For instance,the first row of f.data says 'loc2', depth '0', year 2000, and month 1. Looking at loc.list, the value for loc2, year 2000, month 1 and depth 0 (column 0) is 13. Therefore, I'll copy this value (13) into a new column in f.data (f.data$temp).

> f.data
   spp location depth    Y M
1  spA     loc2     0 2000 1 <<<----
2  spA     loc2     1 2001 2
3  spA     loc2     0 2000 2
4  spB     loc1     0 2001 1
5  spA     loc2     0 2001 1
6  spB     loc1     0 2000 1
7  spB     loc2     1 2000 1
8  spB     loc1     1 2000 2
9  spA     loc1     0 2000 1
10 spA     loc1     1 2001 1

> loc.list
$loc1
     Y M  0 -1
1 2000 1 10  0
2 2001 2 15  5

$loc2
     Y M  0 -1
1 2000 1 13  3  <<<----
2 2001 2 18  7

Initially, I wrote a rudimentary long code that does the work, but it takes its time. For a n.rows = 100000, for instance, I need ~ 18 seconds in my machine.

f.data$temp <- NA

start.time <- Sys.time()

for (i in (1:nrow(f.data))) { 
 
 tryCatch({
   
   for (j in 1:length(loc.list)) { 
     
     for (k in 1:nrow(loc.list[[j]])) { 
       
       for (m in 3:ncol(loc.list[[j]])) {
         
         if (f.data$location[i] == names(loc.list)[j]) {
           
           if (f.data$Y[i] == loc.list[[j]]$Y[k]){ 
             
             if (f.data$M[i] == loc.list[[j]]$M[k]) { 
               
               if (round(f.data$depth[i], digits = 0) == (as.numeric (names(loc.list[[j]])[m])*(-1))) { 
                 
                 f.data$temp[i] <- loc.list[[j]][k,m]
                 
               }
             }
           }
         }
       }
     }
   }
 }, error = function(e){})
}

end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken

> f.data
  spp location depth    Y M temp
1  spA     loc2     0 2000 1   13
2  spA     loc2     1 2001 2    7
3  spA     loc2     0 2000 2   NA
4  spB     loc1     0 2001 1   NA
5  spA     loc2     0 2001 1   NA
6  spB     loc1     0 2000 1   10
7  spB     loc2     1 2000 1    3
8  spB     loc1     1 2000 2   NA
9  spA     loc1     0 2000 1   10
10 spA     loc1     1 2001 1   NA

I improved a bit the code using ifelse(), but within a classical loop. With n.rows = 100000, I get it done in less than a second.

f.data2 <- f.data[,-length(f.data)]

res2 <- c()

start.time2 <- Sys.time()

for (i in 1:length(loc.list)) { # to assess each df in the list
  for (j in 1:nrow(loc.list[[i]])) { # to assess each row of each df in the list
    for (m in 3:ncol(loc.list[[i]])) { # to assess each colum of each df in the list
      
      res <-  ifelse(f.data2$location == names(loc.list)[i]  &
                       f.data2$Y == loc.list[[i]]$Y[j]  &
                       f.data2$M == loc.list[[i]]$M[j] &
                       round(f.data2$depth, digits = 0) == (as.numeric (names(loc.list[[i]])[m])*(-1)),
                     loc.list[[i]][j,m], NA
      )
      res2 <- cbind(res2,res)       
          }
  }
}

end.time2 <- Sys.time()
time.taken2 <- end.time2 - start.time2
time.taken2

f.data2 <- cbind(f.data2,res2)       

f.data2$res.final <-  rowMeans(f.data2[,(ncol(f.data2)-ncol(res2) 1):(ncol(f.data2))],
                               na.rm=T)

f.data2 <- f.data2[, -c((ncol(f.data2)-ncol(res2)):(ncol(f.data2)-1)) ]


f.data2
f.data

sum(!(f.data$temp == f.data2$res.final), na.rm=T)

But because in reality I have a f.data with 88062 rows and a loc.list with 58 dfs that vary a lot in size ( 81–479 x 9–375 rows and columns, respec.), my 'optimized' code is still taking forever. I'd appreciate it a lot if anyone could give an insight on how to make this faster. Txs. L

CodePudding user response：

This sounds like a join, which can be done quite fast without a loop. Here, I combine the list of data frames into one data frame with a location column holding the name of each original table. Then we join (here with dplyr::left_join but could use base merge, or data.table or duckdb or collapse for more speed if needed).

library(tidyverse) 
loc.list.df <- do.call(rbind.data.frame, loc.list) %>%
  rownames_to_column() %>%
  separate(rowname, c("location", "row"))

f.data %>%
  left_join(loc.list.df)

Result

Joining, by = c("location", "Y", "M")
   spp location depth    Y M  row  0 -1
1  spA     loc2     0 2000 1    1 13  3
2  spA     loc2     1 2001 2    2 18  7
3  spA     loc2     0 2000 2 <NA> NA NA
4  spB     loc1     0 2001 1 <NA> NA NA
5  spA     loc2     0 2001 1 <NA> NA NA
6  spB     loc1     0 2000 1    1 10  0
7  spB     loc2     1 2000 1    1 13  3
8  spB     loc1     1 2000 2 <NA> NA NA
9  spA     loc1     0 2000 1    1 10  0
10 spA     loc1     1 2001 1 <NA> NA NA

CodePudding user response：

Thanks, @Limey and @Jon, for your comments. It helped a lot. First I followe Limey's suggestion and bound the list of dataframes in a single one. (More elegant codes are probably available):

loc.list.merged <- list()

for (j in 1:length(loc.list)) {

loc.list1 <- loc.list[[j]]

x.loc.list <- list()

for (i in 3:ncol(loc.list1)) {
  
  x <- data.frame(loc.list1[,i])
  names(x) <- 'temp'
  
  x$depth <- names(loc.list1)[i]
  x$Y <- loc.list1$Y
  x$M <- loc.list1$M
  x$locality <- names(loc.list[j])
  
  x.loc.list[[i-2]] <- x

  }
 
library(dplyr)
yy <- Reduce(full_join,x.loc.list)

loc.list.merged[[j]] <-  yy

}

loc.list.merged2 <- Reduce(full_join, loc.list.merged)
loc.list.merged2$depth <- as.numeric(loc.list.merged2$depth)*(-1)
names(loc.list.merged2)[5] <- 'location'

> loc.list.merged2
  temp depth    Y M location
1   10     0 2000 1     loc1
2   15     0 2001 2     loc1
3    0     1 2000 1     loc1
4    5     1 2001 2     loc1
5   13     0 2000 1     loc2
6   18     0 2001 2     loc2
7    3     1 2000 1     loc2
8    7     1 2001 2     loc2

Then I applied Jon's suggestion:


library(dplyr)
f.data4 <- 
  f.data3 %>% left_join(loc.list.merged2)

> f.data4
   spp location depth    Y M temp
1  spA     loc2     0 2000 1   13
2  spA     loc2     1 2001 2    7
3  spA     loc2     0 2000 2   NA
4  spB     loc1     0 2001 1   NA
5  spA     loc2     0 2001 1   NA
6  spB     loc1     0 2000 1   10
7  spB     loc2     1 2000 1    3
8  spB     loc1     1 2000 2   NA
9  spA     loc1     0 2000 1   10
10 spA     loc1     1 2001 1   NA

It's seems to work. I'll try tomorrow on my actual dataset.