Efficiently map values with irregular lookup table-CodePudding

I want to map values x to new values y, based on which (irregular) vector they are in (lookup), and the value associated with that vector (map). How can I do this efficiently?

x = seq(9)
lookup = list(A=c(1,2,3,9),B=c(4,5,6),C=c(7,8))
map = list(A=.5,B=.4,C=.3)
# MAGIC should yield y like:
y = c(.5,.5,.5,.4,.4,.4,.3,.3,.5)

If it matters, in my case, every element in x is guaranteed to be in lookup, but not all elements in lookup will be in x.

EDIT: here are some bigger data to play with re. efficiency:

N = 1e5
x = seq(N)
x.A = sample(x,N*.5)
x.B = sample(x[-x.A],N*.35)
x.C = sample(x[-c(x.A,x.B)],N*.1)
x.D = sample(x[-c(x.A,x.B,x.C)],N*.05)
lookup = list(A=x.A, B=x.B, C=x.C, D=x.D)
map = list(A=50,B=35,C=10,D=5)

CodePudding user response：

We may replicate the 'map' elements based on the lengths of 'lookup' and then assign a copy of the 'x' with the index of unlisted 'lookup'

lookup2 <- lapply(lookup, function(u) u[u %in% x])
y1 <- x
y1[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))

-checking with OP's 'y'

> identical(y1, y)
[1] TRUE

Or may also use a for loop

y1 <- x
for(nm in names(lookup)) y1[intersect(lookup[[nm]], x)] <- map[[nm]]
identical(y1, y)
[1] TRUE

With the OP's bigger data, for loop is slightly faster

> system.time({y1 <- x
   for(nm in names(lookup)) y1[intersect(lookup[[nm]], x)] <- map[[nm]]
   })
   user  system elapsed 
  0.100   0.004   0.104 
> system.time({lookup2 <- lapply(lookup, function(u) u[u %in% x])
  y1 <- x
  y1[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))
 })
   user  system elapsed 
  0.113   0.003   0.116

Or another option is to stack the named list into a two column data.frame, merge by the 'ind' column, order and extract the 'values.x'

dat <- merge(stack(map), stack(lookup), by = "ind")
dat[order(dat$values.y),]$values.x
[1] 0.5 0.5 0.5 0.4 0.4 0.4 0.3 0.3 0.5

NOTE: map is a function name in package purrr. It is better to assign object names that are not function names

CodePudding user response：

It will probably work better if we index everything in lookup first, then check if it is in x:

x <- seq(9)
lookup <- list(A = c(1,2,3,9), B = c(4,5,6), C = c(7,8))
vmap <- list(A = 0.5, B = 0.4, C = 0.3)

y <- unlist(lookup, use.names = FALSE)
y[y] <- rep.int(unlist(vmap, use.names = FALSE), lengths(lookup))[y %in% x]
y
#> [1] 0.5 0.5 0.5 0.4 0.4 0.4 0.3 0.3 0.5

N <- 1e5
x <- seq(N)
x.A <- sample(x,N*.5)
x.B <- sample(x[-x.A],N*.35)
x.C <- sample(x[-c(x.A,x.B)],N*.1)
x.D <- sample(x[-c(x.A,x.B,x.C)],N*.05)
lookup <- list(A=x.A, B=x.B, C=x.C, D=x.D)
map <- list(A=50,B=35,C=10,D=5)

f1 <- function(x, lookup, map) {
  lookup2 <- lapply(lookup, function(u) u[u %in% x])
  y <- x
  y[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))
  y
}

f2 <- function(x, lookup, map)  {
  y <- x
  for(nm in names(lookup)) y[intersect(lookup[[nm]], x)] <- map[[nm]]
  y
}

f3 <- function(x, lookup, map) {
  y <- unlist(lookup, use.names = FALSE)
  y[y] <- rep.int(unlist(map, use.names = FALSE), lengths(lookup))[y %in% x]
  y
}

microbenchmark::microbenchmark(f1 = f1(x, lookup, map),
                               f2 = f2(x, lookup, map),
                               f3 = f3(x, lookup, map),
                               check = "identical")
#> Unit: milliseconds
#>  expr      min        lq      mean    median       uq      max neval
#>    f1 121.4449 129.71180 135.94304 133.56695 139.6493 254.2026   100
#>    f2  88.0754  92.04575  98.49070  98.13595 103.2960 122.0160   100
#>    f3  33.2488  34.06970  37.24117  34.88195  40.4042  52.7759   100