Home > Enterprise >  R - Mean for different lengths of lists
R - Mean for different lengths of lists

Time:08-25

I have 5 lists of different lengths

# Five numeric vectors whose lengths run from 1 up to 5.
a <- 1                            # one value
b <- c(4.4, 3.5)                  # two values
c <- c(5.6, 7.8, 6.0)             # three values
d <- c(0.8, 6.9, 8.8, 5.8)        # four values
e <- c(1.8, 2.5, 2.3, 6.5, 1.1)   # five values

I am trying to get the mean of elements in all lists:

 #since there are 5 values available for 1st element
 (a[1] + b[1] + c[1] + d[1] + e[1]) / 5

 #since there are 4 values available for 2nd element
 (b[2] + c[2] + d[2] + e[2]) / 4

#next divide by 3 and 2...1
(c[3] + d[3] + e[3]) / 3 and so on...

I need the mean of these values in another array so that I can do further processing of the data

Can anyone suggest how to obtain these means?

CodePudding user response:

Let

# Collect the five vectors into a single unnamed list, in the order a..e.
l <- list(a, b, c, d, e)

then do:

# Flatten the list, label every value with its position inside its own
# vector, then average the values that share a position.
values <- unlist(l)
position <- sequence(lengths(l))
tapply(values, position, mean)
    1     2     3     4     5 
2.720 5.175 5.700 6.150 1.100 

Another approach:

# Pad every vector with NA up to the longest length, lay the padded vectors
# out as the columns of a matrix, and average each row while skipping the NA
# padding. The matrix is built explicitly because sapply() collapses to a
# plain vector when the longest vector has length 1, which rowMeans() rejects.
n <- max(lengths(l))
padded <- lapply(l, `length<-`, n)
rowMeans(matrix(unlist(padded, use.names = FALSE), nrow = n), na.rm = TRUE)
[1] 2.720 5.175 5.700 6.150 1.100

# Turn each vector into a one-row matrix and stack the rows, filling the
# missing cells with NA. lapply() is used so the input is always a list of
# matrices: sapply() would simplify equal-length vectors into a single matrix
# (or a plain vector), and the stacking step would then no longer see one row
# per vector.
colMeans(plyr::rbind.fill.matrix(lapply(l, t)), na.rm = TRUE)
    1     2     3     4     5 
2.720 5.175 5.700 6.150 1.100 

CodePudding user response:

With data.table::transpose:

# Regroup the values by position: element i of the transposed list holds the
# i-th value of every vector (short vectors are filled with NA by default,
# per the data.table documentation), then average each group without the NAs.
l <- list(a, b, c, d, e)
by_position <- data.table::transpose(l)
sapply(by_position, mean, na.rm = TRUE)
# [1] 2.720 5.175 5.700 6.150 1.100

CodePudding user response:

1) Create a list of the vectors and convert each to a ts object. Then use cbind to create a multivariate ts object with each column corresponding to one of the original vectors. Then use rowMeans on that.

# Fetch the vectors by name, wrap each one as a time series, and column-bind
# them into one multivariate series. Row means are taken with NAs dropped, so
# positions that a shorter vector does not reach are simply left out.
nms <- c("a", "b", "c", "d", "e")
L <- mget(nms)
series <- lapply(L, ts)
rowMeans(do.call("cbind", series), na.rm = TRUE)
## [1] 2.720 5.175 5.700 6.150 1.100

Check

# Position 1 has a value in all five vectors.
(a[1] + b[1] + c[1] + d[1] + e[1]) / 5
## [1] 2.72

# Position 2 has a value in every vector except `a`.
(b[2] + c[2] + d[2] + e[2]) / 4
## [1] 5.175

2) Another approach using L from above is to use sapply:

# For each position i, pull the i-th value out of every vector in L (indexing
# past the end of a vector yields NA) and average what is there. Iterating
# over L instead of naming a..e keeps this working for any number of vectors;
# seq_len() and vapply() keep the result a numeric vector.
n <- max(lengths(L))
vapply(
  seq_len(n),
  function(i) mean(vapply(L, `[`, numeric(1), i), na.rm = TRUE),
  numeric(1)
)
## [1] 2.720 5.175 5.700 6.150 1.100

3) or use sapply like this where L and n are from above:

# Indexing each vector with 1..n pads the short ones with NA. The padded
# vectors become the columns of an explicit matrix, because sapply() would
# return a plain vector when n is 1 and rowMeans() needs two dimensions.
padded <- lapply(L, `[`, seq_len(n))
rowMeans(matrix(unlist(padded, use.names = FALSE), nrow = n), na.rm = TRUE)

CodePudding user response:

Another method: put your vectors into a list, pad the shorter vectors with NA so that they all have equal length, and finally call rowMeans on the result.

mylist <- list(a, b, c, d, e)
max_L <- max(lengths(mylist))

# Append NAs so every vector reaches the longest length, then use the padded
# vectors as matrix columns. Building the matrix explicitly (rather than via
# sapply()) still works when the longest vector has length 1.
padded <- lapply(mylist, \(x) c(x, rep(NA, max_L - length(x))))
rowMeans(
  matrix(unlist(padded, use.names = FALSE), nrow = max_L),
  na.rm = TRUE
)
[1] 2.720 5.175 5.700 6.150 1.100

CodePudding user response:

Using tidyverse

library(dplyr)
library(tidyr)
library(tibble)      # enframe() lives here; dplyr and tidyr do not attach it
library(data.table)  # rowid() lives here

# Stack the named vectors into a long table (one row per value), number the
# rows within each source vector to get the position, then average the values
# that share a position.
mget(letters[1:5]) %>%
  enframe() %>%
  unnest(value) %>%
  group_by(grp = rowid(name)) %>%
  summarise(value = mean(value)) %>%
  pull(value)

-output

[1] 2.720 5.175 5.700 6.150 1.100

CodePudding user response:

You can try colMeans with option na.rm = TRUE like below

# One row per vector and one column per position; cells that a shorter vector
# does not reach stay NA and are ignored by colMeans(). The row count comes
# from the number of vectors, so the matrix need not be square.
lst <- list(a, b, c, d, e)
L <- max(lengths(lst))
m <- matrix(NA_real_, nrow = length(lst), ncol = L)
# Each (row, column) pair is (index of the vector, position inside it).
cells <- cbind(rep(seq_along(lst), lengths(lst)), sequence(lengths(lst)))
m[cells] <- unlist(lst)
colMeans(m, na.rm = TRUE)

which gives

[1] 2.720 5.175 5.700 6.150 1.100

CodePudding user response:

Another possible solution, based on the idea that the matrix resulting from row binding a, b, c, d, e is triangular:

# Filling the upper triangle column by column puts vector j into column j
# (rows 1..j), so row i collects the i-th value of every vector that has one.
# This relies on the vectors having lengths 1, 2, ..., 5 in that order.
m <- matrix(NA_real_, 5, 5)
m[upper.tri(m, diag = TRUE)] <- c(a, b, c, d, e)
rowMeans(m, na.rm = TRUE)

#> [1] 2.720 5.175 5.700 6.150 1.100

CodePudding user response:

Interesting to see the wide range of timings on a larger list:

library(plyr)
library(data.table)
library(dplyr)
library(tidyr)
library(tibble)

# Reproducible benchmark input: 1e5 uniform draws, each assigned to one of up
# to 1e4 randomly chosen groups, returned as an unnamed list of vectors.
set.seed(976933858)
l <- runif(1e5) |>
  split(sample(1e4, 1e5, TRUE)) |>
  unname()

# Position-wise means via cumulative sums (no per-position function calls).
f0 <- function(l) { # jblood94
  sizes <- lengths(l)
  # Values reordered so that all first elements come first, then all second
  # elements, and so on.
  by_position <- unlist(l)[order(sequence(sizes))]
  # Number of vectors long enough to contribute a value at each position.
  lens <- rev(cumsum(rev(tabulate(sizes))))
  # Running total read off at the last value of each position; differencing
  # those totals recovers the per-position sums.
  running <- cumsum(by_position)[cumsum(lens)]
  diff(c(0, running)) / lens
}
# Position-wise means by transposing the list and averaging each element.
# `transpose` is resolved from the attached packages (data.table is loaded
# above).
f1 <- function(l) { # Mael
  by_position <- transpose(l)
  sapply(by_position, mean, na.rm = TRUE)
}
# Position-wise means by grouping the flattened values on their position.
f2 <- function(l) { # onyambu
  position <- sequence(lengths(l))
  per_position <- tapply(unlist(l), position, mean)
  # Drop the array attributes so a bare numeric vector is returned.
  as.numeric(per_position)
}
# Position-wise means by NA-padding every vector to the longest length.
# The padded vectors are placed into an explicit matrix (one column per
# vector): sapply() would collapse to a plain vector when the longest vector
# has length 1, and rowMeans() would then fail.
f3 <- function(l) { # onyambu
  n <- max(lengths(l))
  padded <- lapply(l, `length<-`, n)
  rowMeans(matrix(unlist(padded, use.names = FALSE), nrow = n), na.rm = TRUE)
}
# Position-wise means by indexing every vector with 1..n, which pads the
# short ones with NA. The matrix is built explicitly (one column per vector)
# so the function also works when the longest vector has length 1, where
# sapply() would return a plain vector that rowMeans() rejects.
f4 <- function(L) { # G. Grothendieck
  n <- max(lengths(L))
  padded <- lapply(L, `[`, seq_len(n))
  rowMeans(matrix(unlist(padded, use.names = FALSE), nrow = n), na.rm = TRUE)
}
# Position-wise means by appending NAs to each vector until all share the
# longest length. The padded vectors are placed into an explicit matrix (one
# column per vector) so the all-length-1 case, where sapply() would return a
# plain vector, is handled as well.
f5 <- function(mylist) {
  # benson23
  max_L <- max(lengths(mylist))
  padded <- lapply(mylist, function(x) c(x, rep(NA, max_L - length(x))))
  rowMeans(
    matrix(unlist(padded, use.names = FALSE), nrow = max_L),
    na.rm = TRUE
  )
}
# Position-wise means with a tidyverse pipeline: stack the list into a long
# table, number the rows within each source element, and average by that
# row number. Relies on the packages attached above.
f6 <- function(l) { # akrun
  l %>%
    enframe() %>%
    unnest(value) %>%
    group_by(grp = rowid(name)) %>%
    summarise(value = mean(value)) %>%
    pull(value)
}
# Position-wise means by stacking each vector as a one-row matrix and filling
# the gaps with NA. lapply() guarantees a list of one-row matrices; sapply()
# would simplify equal-length vectors into a single matrix (or a plain
# vector), which yields wrong means or an error.
f7 <- function(l) { # onyambu
  rows <- lapply(l, t)
  unname(colMeans(rbind.fill.matrix(rows), na.rm = TRUE))
}
# Position-wise means by treating each vector as a time series, binding the
# series column-wise, and averaging each row with NAs dropped.
f8 <- function(L) { # G. Grothendieck
  series <- lapply(L, ts)
  bound <- do.call("cbind", series)
  rowMeans(bound, na.rm = TRUE)
}

# Time every candidate on the same list `l`. Each label names the author of
# the approach, with a numeric suffix when one author contributed several.
# `check = "identical"` asks microbenchmark to verify that all expressions
# return the same result before the timings are reported.
microbenchmark::microbenchmark(
  jblood94 = f0(l),
  Mael = f1(l),
  onyambu1 = f2(l),
  onyambu2 = f3(l),
  G.Grothendieck1 = f4(l),
  benson23 = f5(l),
  akrun = f6(l),
  onyambu3 = f7(l),
  G.Grothendieck2 = f8(l),
  check = "identical"
)
#> Unit: milliseconds
#>             expr      min        lq       mean    median        uq      max neval
#>         jblood94   1.0303   1.15555   1.544470   1.22690   1.32600   6.4604   100
#>             Mael   1.7522   1.88085   3.214797   1.98215   2.11660  85.0957   100
#>         onyambu1   2.4569   2.61115   4.030907   2.77880   2.94575  99.1389   100
#>         onyambu2   4.1092   4.64760   6.034596   4.91770   6.23725  21.6162   100
#>  G.Grothendieck1   4.7224   5.11220   6.485828   5.63770   7.08075  12.9853   100
#>         benson23  10.7711  12.03325  16.352950  16.06155  19.34285  32.2033   100
#>            akrun  13.8533  16.11800  18.376538  17.09465  20.45810  39.1432   100
#>         onyambu3 342.7180 410.80445 465.461878 451.92355 504.14595 800.1453   100
#>  G.Grothendieck2 465.0375 509.92070 565.463577 553.64825 607.95115 777.0444   100
  • Related