I want to map values x
to new values y
, based on which (irregular) vector they are in (lookup
), and the value associated with that vector (map
). How can I do this efficiently?
x = seq(9)
lookup = list(A=c(1,2,3,9),B=c(4,5,6),C=c(7,8))
map = list(A=.5,B=.4,C=.3)
# MAGIC should yield y like:
y = c(.5,.5,.5,.4,.4,.4,.3,.3,.5)
If it matters, in my case, every element in x
is guaranteed to be in lookup
, but not all elements in lookup
will be in x
.
EDIT: here are some bigger data to play with re. efficiency:
N = 1e5
x = seq(N)
x.A = sample(x,N*.5)
x.B = sample(x[-x.A],N*.35)
x.C = sample(x[-c(x.A,x.B)],N*.1)
x.D = sample(x[-c(x.A,x.B,x.C)],N*.05)
lookup = list(A=x.A, B=x.B, C=x.C, D=x.D)
map = list(A=50,B=35,C=10,D=5)
CodePudding user response:
We may rep
licate the 'map' elements based on the lengths
of 'lookup' and then assign a copy of the 'x' with the index of unlist
ed 'lookup'
lookup2 <- lapply(lookup, function(u) u[u %in% x])
y1 <- x
y1[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))
-checking with OP's 'y'
> identical(y1, y)
[1] TRUE
Or may also use a for
loop
y1 <- x
for(nm in names(lookup)) y1[intersect(lookup[[nm]], x)] <- map[[nm]]
identical(y1, y)
[1] TRUE
With the OP's bigger data, for
loop is slightly faster
> system.time({y1 <- x
for(nm in names(lookup)) y1[intersect(lookup[[nm]], x)] <- map[[nm]]
})
user system elapsed
0.100 0.004 0.104
> system.time({lookup2 <- lapply(lookup, function(u) u[u %in% x])
y1 <- x
y1[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))
})
user system elapsed
0.113 0.003 0.116
Or another option is to stack
the named list
into a two column data.frame, merge
by the 'ind' column, order
and extract the 'values.x'
dat <- merge(stack(map), stack(lookup), by = "ind")
dat[order(dat$values.y),]$values.x
[1] 0.5 0.5 0.5 0.4 0.4 0.4 0.3 0.3 0.5
NOTE: map
is a function name in package purrr
. It is better to assign object names that are not function names
CodePudding user response:
It will probably work better if we index everything in lookup
first, then check if it is in x
:
x <- seq(9)
lookup <- list(A = c(1,2,3,9), B = c(4,5,6), C = c(7,8))
vmap <- list(A = 0.5, B = 0.4, C = 0.3)
y <- unlist(lookup, use.names = FALSE)
y[y] <- rep.int(unlist(vmap, use.names = FALSE), lengths(lookup))[y %in% x]
y
#> [1] 0.5 0.5 0.5 0.4 0.4 0.4 0.3 0.3 0.5
N <- 1e5
x <- seq(N)
x.A <- sample(x,N*.5)
x.B <- sample(x[-x.A],N*.35)
x.C <- sample(x[-c(x.A,x.B)],N*.1)
x.D <- sample(x[-c(x.A,x.B,x.C)],N*.05)
lookup <- list(A=x.A, B=x.B, C=x.C, D=x.D)
map <- list(A=50,B=35,C=10,D=5)
f1 <- function(x, lookup, map) {
lookup2 <- lapply(lookup, function(u) u[u %in% x])
y <- x
y[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))
y
}
f2 <- function(x, lookup, map) {
y <- x
for(nm in names(lookup)) y[intersect(lookup[[nm]], x)] <- map[[nm]]
y
}
f3 <- function(x, lookup, map) {
y <- unlist(lookup, use.names = FALSE)
y[y] <- rep.int(unlist(map, use.names = FALSE), lengths(lookup))[y %in% x]
y
}
microbenchmark::microbenchmark(f1 = f1(x, lookup, map),
f2 = f2(x, lookup, map),
f3 = f3(x, lookup, map),
check = "identical")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 121.4449 129.71180 135.94304 133.56695 139.6493 254.2026 100
#> f2 88.0754 92.04575 98.49070 98.13595 103.2960 122.0160 100
#> f3 33.2488 34.06970 37.24117 34.88195 40.4042 52.7759 100