Home > Mobile >  Efficiently map values with irregular lookup table
Efficiently map values with irregular lookup table

Time:08-07

I want to map values x to new values y, based on which (irregular) vector they are in (lookup), and the value associated with that vector (map). How can I do this efficiently?

x = seq(9)
lookup = list(A=c(1,2,3,9),B=c(4,5,6),C=c(7,8))
map = list(A=.5,B=.4,C=.3)
# MAGIC should yield y like:
y = c(.5,.5,.5,.4,.4,.4,.3,.3,.5)

If it matters, in my case, every element in x is guaranteed to be in lookup, but not all elements in lookup will be in x.


EDIT: here are some bigger data to play with re. efficiency:

N = 1e5
x = seq(N)
x.A = sample(x,N*.5)
x.B = sample(x[-x.A],N*.35)
x.C = sample(x[-c(x.A,x.B)],N*.1)
x.D = sample(x[-c(x.A,x.B,x.C)],N*.05)
lookup = list(A=x.A, B=x.B, C=x.C, D=x.D)
map = list(A=50,B=35,C=10,D=5)

CodePudding user response:

We may replicate the 'map' elements based on the lengths of 'lookup' and then assign a copy of the 'x' with the index of unlisted 'lookup'

lookup2 <- lapply(lookup, function(u) u[u %in% x])
y1 <- x
y1[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))

-checking with OP's 'y'

> identical(y1, y)
[1] TRUE

Or may also use a for loop

y1 <- x
for(nm in names(lookup)) y1[intersect(lookup[[nm]], x)] <- map[[nm]]
identical(y1, y)
[1] TRUE

With the OP's bigger data, for loop is slightly faster

> system.time({y1 <- x
   for(nm in names(lookup)) y1[intersect(lookup[[nm]], x)] <- map[[nm]]
   })
   user  system elapsed 
  0.100   0.004   0.104 
> system.time({lookup2 <- lapply(lookup, function(u) u[u %in% x])
  y1 <- x
  y1[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))
 })
   user  system elapsed 
  0.113   0.003   0.116 

Or another option is to stack the named list into a two column data.frame, merge by the 'ind' column, order and extract the 'values.x'

dat <- merge(stack(map), stack(lookup), by = "ind")
dat[order(dat$values.y),]$values.x
[1] 0.5 0.5 0.5 0.4 0.4 0.4 0.3 0.3 0.5

NOTE: map is a function name in package purrr. It is better to assign object names that are not function names

CodePudding user response:

It will probably work better if we index everything in lookup first, then check if it is in x:

x <- seq(9)
lookup <- list(A = c(1,2,3,9), B = c(4,5,6), C = c(7,8))
vmap <- list(A = 0.5, B = 0.4, C = 0.3)

y <- unlist(lookup, use.names = FALSE)
y[y] <- rep.int(unlist(vmap, use.names = FALSE), lengths(lookup))[y %in% x]
y
#> [1] 0.5 0.5 0.5 0.4 0.4 0.4 0.3 0.3 0.5

N <- 1e5
x <- seq(N)
x.A <- sample(x,N*.5)
x.B <- sample(x[-x.A],N*.35)
x.C <- sample(x[-c(x.A,x.B)],N*.1)
x.D <- sample(x[-c(x.A,x.B,x.C)],N*.05)
lookup <- list(A=x.A, B=x.B, C=x.C, D=x.D)
map <- list(A=50,B=35,C=10,D=5)

f1 <- function(x, lookup, map) {
  lookup2 <- lapply(lookup, function(u) u[u %in% x])
  y <- x
  y[unlist(lookup2)] <- rep(unlist(map), lengths(lookup2))
  y
}

f2 <- function(x, lookup, map)  {
  y <- x
  for(nm in names(lookup)) y[intersect(lookup[[nm]], x)] <- map[[nm]]
  y
}

f3 <- function(x, lookup, map) {
  y <- unlist(lookup, use.names = FALSE)
  y[y] <- rep.int(unlist(map, use.names = FALSE), lengths(lookup))[y %in% x]
  y
}

microbenchmark::microbenchmark(f1 = f1(x, lookup, map),
                               f2 = f2(x, lookup, map),
                               f3 = f3(x, lookup, map),
                               check = "identical")
#> Unit: milliseconds
#>  expr      min        lq      mean    median       uq      max neval
#>    f1 121.4449 129.71180 135.94304 133.56695 139.6493 254.2026   100
#>    f2  88.0754  92.04575  98.49070  98.13595 103.2960 122.0160   100
#>    f3  33.2488  34.06970  37.24117  34.88195  40.4042  52.7759   100
  • Related