Fastest way of putting together vectors from a for loop that have different lengths-CodePudding

I am running a query with a for-loop that creates vectors of different lengths. The code of the first vector is not necessarily the longest (I created the example so that the first column is the shortest and keeps getting longer). I want to somehow bind these vectors. I messed around a bit with this make-shift cbind.fill function, but did not get it to work (rowr::cbind.fill is no longer available in R 4.x.x).

The example code below does not run properly because runif(i) makes the vector longer for each subsequent loop. Please note that for the actual data I do not know which column is the longest. Checking this is possible but not preferred, although I can imagine that maybe just keeping the vectors until the loop is complete and binding them then would be faster.

Example code:

# Minimal reproduction of the problem: every iteration draws a vector of a
# different length and tries to attach it to the data frame as a new column.
dat <- c(1,2,3)
dat <- as.data.frame(dat)
for (i in 1:5) {
    # runif(i) has length i, so from i = 2 onwards cbind() stops because the
    # lengths no longer fit the 3-row data frame -- that failure is the point
    # of this example.
    temp <- runif(i)
    dat <- cbind(dat, temp)
    # Column 1 is the original `dat`, so the i-th new column is at i + 1.
    names(dat)[i + 1] <- paste0("nr", i)
}

What is the fastest way to turn the output vectors into a data.frame? I thought of putting them into a list of vectors first (but did not know how to do the naming in that case), or perhaps padding every vector with NAs, adding length(longest_vector)-length(vector[i]) of them to each.

Desired output:

# A tibble: 8 x 6
  dat   nr1                 nr2                 nr3                 nr4                    nr5
  <chr> <chr>               <chr>               <chr>               <chr>                <dbl>
1 1     1                   1                   1                   1                   1     
2 2     2                   2                   2                   2                   2     
3 3     3                   3                   3                   3                   3     
4 NA    0.43526783056537444 0.25560407791435225 0.91653997616714789 0.62635622073335406 0.888 
5 NA    NA                  0.56979342124862575 0.43296269966267631 0.46423817219260977 0.522 
6 NA    NA                  NA                  0.89399553062032511 0.34917334540558442 0.745 
7 NA    NA                  NA                  NA                  0.4131315834365703  0.0403
8 NA    NA                  NA                  NA                  NA                  0.564 

 # Reproducible definition of the desired 8-row result. Columns dat and
 # nr1-nr4 are character vectors in which empty cells hold the literal
 # string "NA" (not a missing value); nr5 is numeric.
 desired_out <- structure(list(dat = c("1", "2", "3", "NA", "NA", "NA", "NA", 
"NA"), nr1 = c("1", "2", "3", "0.43526783056537444", "NA", "NA", 
"NA", "NA"), nr2 = c("1", "2", "3", "0.25560407791435225", "0.56979342124862575", 
"NA", "NA", "NA"), nr3 = c("1", "2", "3", "0.91653997616714789", 
"0.43296269966267631", "0.89399553062032511", "NA", "NA"), nr4 = c("1", 
"2", "3", "0.62635622073335406", "0.46423817219260977", "0.34917334540558442", 
"0.4131315834365703", "NA"), nr5 = c(1, 2, 3, 0.887930290142606, 
0.522131799371126, 0.745103223905874, 0.0403367661303002, 0.563609740553749
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-8L))

CodePudding user response：

You can use plyr::ldply for multiple vectors with different length

dat <- c(1,2,3)
dat <- as.data.frame(dat)
# Collect every vector in a list first; element 1 is the original column,
# so the vector produced in iteration i is stored at position i + 1.
dat.list <- list(as.vector(t(dat)))
for (i in 1:5) {
  dat.list[[i + 1]] <- runif(i)
}
# ldply(..., rbind) stacks the vectors as rows and pads the shorter ones with
# NA; t() then flips them into columns (the result is a matrix).
dat <- t(plyr::ldply(dat.list, rbind))
colnames(dat) <- c("dat", paste0("nr", 1:5))

> dat
  dat       nr1       nr2       nr3       nr4        nr5
1   1 0.8714848 0.8165862 0.5245153 0.7647691 0.15276644
2   2        NA 0.1034356 0.3850973 0.1769444 0.56437654
3   3        NA        NA 0.3773377 0.9142521 0.31727519
4  NA        NA        NA        NA 0.5343319 0.44647840
5  NA        NA        NA        NA        NA 0.07558151

Check time cost using microbenchmark::microbenchmark

# Time the complete list-then-bind approach for the 5-column example.
microbenchmark::microbenchmark(
  a = {
    dat <- c(1, 2, 3)
    dat <- as.data.frame(dat)
    dat.list <- list(as.vector(t(dat)))
    for (i in 1:5) {
      # Element 1 holds the original column, hence the i + 1 offset.
      dat.list[[i + 1]] <- runif(i)
    }
    dat <- t(plyr::ldply(dat.list, rbind))
    colnames(dat) <- c("dat", paste0("nr", 1:5))
  }
)

Unit: milliseconds
 expr   min     lq     mean median      uq  max neval
    a 5.008 5.3714 5.844143 5.6862 5.98705 9.84   100

For 1000 vectors, each of length 1000:

# Same approach at scale: 1000 generated vectors, each of length 1000.
microbenchmark::microbenchmark(
  a = {
    dat <- c(1, 2, 3)
    dat <- as.data.frame(dat)
    dat.list <- list(as.vector(t(dat)))
    for (i in 1:1000) {
      # Element 1 holds the original column, hence the i + 1 offset.
      dat.list[[i + 1]] <- runif(1000)
    }
    dat <- t(plyr::ldply(dat.list, rbind))
    colnames(dat) <- c("dat", paste0("nr", 1:1000))
  }
)

Unit: milliseconds
 expr      min      lq     mean   median       uq      max neval
    a 127.9646 132.236 151.2108 135.2484 141.3047 369.3313   100

CodePudding user response：

The first few rows of the desired output look like they don't add information.

solution 1

#' Stack a constant header block on top of a ragged block of random draws
#'
#' @param vec Vector whose values are repeated down every column; it forms
#'   the first `length(vec)` rows of the result.
#' @param l Number of columns (at least 1). Column `i` receives `i` uniform
#'   random draws followed by `NA` padding.
#' @return A tibble with `length(vec) + l` rows and `l` columns named
#'   `nr1`, ..., `nr<l>`.
f <- function(vec, l) {
  # Fail early with a readable message: `1:l` would count down (1, 0) for
  # l = 0 and only surface later as an obscure dimnames error.
  stopifnot(is.numeric(l), length(l) == 1L, !is.na(l), l >= 1)
  col_names <- paste0("nr", seq_len(l))

  # Header block: `vec` recycled into each of the l columns.
  header <- matrix(vec, nrow = length(vec), ncol = l)
  colnames(header) <- col_names

  # Ragged block: column i gets i draws in rows 1..i, the rest stays NA.
  # The draws are made column by column so the random stream is consumed in
  # the same order as one runif() call per generated vector.
  draws <- matrix(NA_real_, nrow = l, ncol = l)
  for (i in seq_len(l)) {
    draws[seq_len(i), i] <- runif(i)
  }
  colnames(draws) <- col_names

  dplyr::bind_rows(tibble::as_tibble(header), tibble::as_tibble(draws))
}

f(c(1L,2L,3L), 5L)
# A tibble: 8 x 5
     nr1     nr2    nr3    nr4   nr5
   <dbl>   <dbl>  <dbl>  <dbl> <dbl>
1  1      1       1      1     1    
2  2      2       2      2     2    
3  3      3       3      3     3    
4  0.244  0.0891  0.881  0.749 0.332
5 NA      0.407   0.519  0.479 0.113
6 NA     NA       0.633  0.561 0.593
7 NA     NA      NA      0.409 0.631
8 NA     NA      NA     NA     0.992

Further optimization can be done by translating the for-loop into C++, and by no longer adding the first few rows of the desired output.

solution 2

#' Vectorised variant: fill the upper triangle of an l x l matrix in one go
#'
#' @param vec Unused; kept so the signature matches `f()`.
#' @param l Number of columns. Column `i` receives `i` uniform random draws
#'   in rows 1..i; all remaining cells are `NA`.
#' @return A data.frame with `l` rows and `l` columns named `nr1`, ...,
#'   `nr<l>` (an empty data.frame when `l` is 0).
f2 <- function(vec, l) {
  m <- matrix(NA_real_, nrow = l, ncol = l)
  # The upper triangle including the diagonal is exactly "rows 1..i of
  # column i". Counting the mask gives the l * (l + 1) / 2 draws needed
  # without a hand-written formula.
  upper <- upper.tri(m, diag = TRUE)
  m[upper] <- runif(sum(upper))
  # seq_len() keeps l = 0 valid, whereas 1:l would yield c(1, 0).
  colnames(m) <- paste0("nr", seq_len(l))
  as.data.frame(m)
}

Note that the desired output has character vectors as output - this is questionable memory management. mutate(across(where(is.numeric), as.character)) can be used if desired.

Benchmarks

Benchmarks are performed on the workhorse of the functions, creating a 1000 by 1000 data.frame.

# rewriting f to not output first rows
#' Loop-based variant of `f()` without the constant header rows
#'
#' @param vec Unused; kept so the signature matches `f()`.
#' @param l Number of columns. Column `i` receives `i` uniform random draws
#'   in rows 1..i; all remaining cells are `NA`.
#' @return A data.frame with `l` rows and `l` columns named `nr1`, ...,
#'   `nr<l>` (an empty data.frame when `l` is 0).
f_clean <- function(vec, l) {
  m <- matrix(NA_real_, nrow = l, ncol = l)
  # seq_len() makes the loop a no-op for l = 0; seq_along(1:l) would iterate
  # over c(1, 0) and index out of bounds.
  for (i in seq_len(l)) {
    m[seq_len(i), i] <- runif(i)
  }
  colnames(m) <- paste0("nr", seq_len(l))
  as.data.frame(m)
}

# Compare the loop-based and the vectorised generator on a 1000 x 1000 result.
# Both expressions reset the seed before generating their output.
bench::mark(
  f = {
    set.seed(1)
    f_clean(1L:1000L, 1000L)
  },
  f2 = {
    set.seed(1)
    f2(1L:1000L, 1000L)
  },
  iterations = 100
)[c(3, 5, 7)]  # columns 3, 5, 7: median, mem_alloc, n_itr in the output below
    median mem_alloc n_itr
  <bch:tm> <bch:byt> <int>
1   22.4ms    31.3MB    94 #note: tibble output is slightly more memory(27MB) friendly
2     26ms      44MB    88