Home > Software design >  How to create a new data table based on pairwise combinations of a subset of column names?
How to create a new data table based on pairwise combinations of a subset of column names?

Time:12-28

I am trying to define a function that takes a data frame or table as input with a specific number of ID columns (e.g., 2 or 3 ID columns), and the remaining columns are NAME1, NAME2, ..., NAMEK (numeric columns). The output should be a data table that consists of the same ID columns as before plus one additional ID column that groups each unique pairwise combination of the column names (NAME1, NAME2, ...). In addition, we must gather the actual values of the numeric columns into two new columns based on the ID column; an example with two ID columns and three numeric columns:

ID1 <- c("A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3)
NAME1 <- c(10,11,9,22,25,22)
NAME2 <- c(7,9,8,20,22,21)
NAME3 <- c(10,12,11,15,19,30)
DT <- data.table(ID1,ID2,NAME1,NAME2,NAME3)

I want the output of the function with DT as input to be

ID.new <- c("NAME1 - NAME2","NAME1 - NAME2","NAME1 - NAME2", "NAME1 - NAME2",
 "NAME1 - NAME2","NAME1 - NAME2", "NAME1 - NAME3", "NAME1 - NAME3",
 "NAME1 - NAME3","NAME1 - NAME3","NAME1 - NAME3","NAME1 - NAME3",
 "NAME2 - NAME3","NAME2 - NAME3","NAME2 - NAME3","NAME2 - NAME3",
 "NAME2 - NAME3", "NAME2 - NAME3")
ID1 <- c("A","A","A","B","B","B","A","A","A","B","B","B","A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
value.left <- c(10,11,9,22,25,22,10,11,9,22,25,22,7,9,8,20,22,21)
value.right <- c(7,9,8,20,22,21,10,12,11,15,19,30,10,12,11,15,19,30)
DT.output <- data.table(ID.new,ID1,ID2,value.left,value.right)

I have found that fun() (see below) does the job, but is too slow for my liking:

  fun <- function(data, ID.cols){
   data <- data.table(data)
   # Which of the columns are ID columns
   ids <-  which(colnames(data) %in% ID.cols)
   # Obtain all pairwise combinations of numeric columns into a list
   numeric.combs <- combn(x = data.table(data)[,!ids, with = FALSE], m = 2, simplify = FALSE)
   id.cols <- data[,ids, with = FALSE]
   # bind the ID columns to each pairwise combination of numeric columns inside the list
   bind.columns.each.numeric.comb <- lapply(X = numeric.combs, FUN = function(x) cbind(id.cols,x)) 
   # Create generic names for the numeric columns so that rbindlist() may be applied. In addition we make a new column that groups based on which columns we are considering
   generalize <- suppressWarnings(lapply(X = bind.columns.each.numeric.comb, FUN = function(x) 
   setattr(x = x[,ID.NEW:=paste(colnames(x[,!ids,with=FALSE]),collapse=" - ")], name = 
   'names', value = c(ID.cols,"value.left","value.right","ID.NEW"))))
   return(rbindlist(l=generalize))
}

# Performance
print(microbenchmark(fun(DT,ID.cols=c("ID1","ID2")),times=1000))

Is there a faster and more elegant way to do this?

CodePudding user response:

You can turn DT into long format by melt firstly. Then to shift value with the step -nrow(DT) in order to do the minus operation, i.e. NAME1 - NAME2, NAME2 - NAME3, NAME3 - NAME1. Finally, we need to correct NAME3 - NAME1 to NAME1 - NAME3.

ds = melt(DT, 
          id.vars = c("ID1","ID2"), 
          measure.vars = c("NAME1","NAME2","NAME3"),
          value.name = c("value.left")
          )

group_len = nrow(DT)
ds[, value.right := shift(value.left, n = -group_len, type = c("cyclic"))]
ds[, ID.new := value.left - value.right]
# NAME3 - NAME1 to NAME1 - NAME3
ds[variable %in% c("NAME3"), value.left := value.left - ID.new]
ds[variable %in% c("NAME3"), value.right := value.right   ID.new]
ds[variable %in% c("NAME3"), ID.new := - ID.new]
ds[,.(ID.new, ID1, ID2, value.left, value.right)]

output:

      ID.new    ID1   ID2 value.left value.right
     <num> <char> <num>      <num>       <num>
 1:      3      A     1         10           7
 2:      2      A     2         11           9
 3:      1      A     3          9           8
 4:      2      B     1         22          20
 5:      3      B     2         25          22
 6:      1      B     3         22          21
 7:     -3      A     1          7          10
 8:     -3      A     2          9          12
 9:     -3      A     3          8          11
10:      5      B     1         20          15
11:      3      B     2         22          19
12:     -9      B     3         21          30
13:      0      A     1         10          10
14:     -1      A     2         11          12
15:     -2      A     3          9          11
16:      7      B     1         22          15
17:      6      B     2         25          19
18:     -8      B     3         22          30

CodePudding user response:

I think the approach using combn() seems reasonable to me. Here your function modified to a base R version. It should be scalable to k columns.

fun2 <- function(data, ID.cols){
  ids <-  which(colnames(data) %in% ID.cols)
  ## you can loop over the combinations directly
  new_dat <- combn(data[-ids], 2, function(x) {
    new_x <- setNames(x, paste("value", c("left", "right"), sep = "."))
    ## use paste with collapse for the ID.new
    new_x$ID.new <- paste(names(x), collapse = " - ")
    # cbind(new_x, data[ids])
    new_x
  }, simplify = FALSE)

## bind it with the old ID columns, outside the loop (bit faster)
  cbind(do.call(rbind, new_dat), data[ids])
}

fun2(DT,ID.cols = c("ID1", "ID2"))
#>    value.left value.right        ID.new ID1 ID2
#> 1          10           7 NAME1 - NAME2   A   1
#> 2          11           9 NAME1 - NAME2   A   2
#> 3           9           8 NAME1 - NAME2   A   3
#> 4          22          20 NAME1 - NAME2   B   1
#> 5          25          22 NAME1 - NAME2   B   2
#> 6          22          21 NAME1 - NAME2   B   3
#> 7          10          10 NAME1 - NAME3   A   1
#> 8          11          12 NAME1 - NAME3   A   2
#> 9           9          11 NAME1 - NAME3   A   3
#> 10         22          15 NAME1 - NAME3   B   1
#> 11         25          19 NAME1 - NAME3   B   2
#> 12         22          30 NAME1 - NAME3   B   3
#> 13          7          10 NAME2 - NAME3   A   1
#> 14          9          12 NAME2 - NAME3   A   2
#> 15          8          11 NAME2 - NAME3   A   3
#> 16         20          15 NAME2 - NAME3   B   1
#> 17         22          19 NAME2 - NAME3   B   2
#> 18         21          30 NAME2 - NAME3   B   3

Benchmark

microbenchmark::microbenchmark(
  pernkf = fun(DT, ID.cols = c("ID1", "ID2")),
  tjebo = fun2(DT, ID.cols = c("ID1", "ID2")), times = 1000
)
#> Unit: microseconds
#>    expr      min        lq     mean    median       uq       max neval
#>  pernkf 2957.569 3160.1870 4732.249 3338.4275 4138.896 133705.15  1000
#>   tjebo  345.292  372.5455  560.661  418.5015  460.802  21592.78  1000

CodePudding user response:

A melted, self-join option:

library(data.table)
DTlong <- melt(DT, id.vars = c("ID1", "ID2"), variable.factor = FALSE)
out <- DTlong[DTlong, on = .(ID1, ID2), allow.cartesian = TRUE
  ][variable < i.variable,
  ][, .(ID.new = paste(variable, i.variable, sep = " - "),
        ID1, ID2, value.left = value, value.right = i.value)]
out
#            ID.new    ID1   ID2 value.left value.right
#            <char> <char> <num>      <num>       <num>
#  1: NAME1 - NAME2      A     1         10           7
#  2: NAME1 - NAME2      A     2         11           9
#  3: NAME1 - NAME2      A     3          9           8
#  4: NAME1 - NAME2      B     1         22          20
#  5: NAME1 - NAME2      B     2         25          22
#  6: NAME1 - NAME2      B     3         22          21
#  7: NAME1 - NAME3      A     1         10          10
#  8: NAME2 - NAME3      A     1          7          10
#  9: NAME1 - NAME3      A     2         11          12
# 10: NAME2 - NAME3      A     2          9          12
# 11: NAME1 - NAME3      A     3          9          11
# 12: NAME2 - NAME3      A     3          8          11
# 13: NAME1 - NAME3      B     1         22          15
# 14: NAME2 - NAME3      B     1         20          15
# 15: NAME1 - NAME3      B     2         25          19
# 16: NAME2 - NAME3      B     2         22          19
# 17: NAME1 - NAME3      B     3         22          30
# 18: NAME2 - NAME3      B     3         21          30

### validation
setorder(out, ID.new, ID1, ID2)
identical(DT.output, out)
# [1] TRUE

Benchmarking:

bench::mark(
  pernkf  = fun(DT, c("ID1", "ID2")),
  tjebo   = fun2(DT, c("ID1", "ID2")),
  r2evans = fun3(DT, c("ID1", "ID2")),
  check = FALSE)
# A tibble: 3 x 13
#   expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory         time      gc        
#   <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>         <list>    <list>    
# 1 pernkf      3.04ms  3.59ms      264.     334KB    11.6    114     5      432ms <NULL> <Rprofmem[,3]~ <bch:tm ~ <tibble [~
# 2 tjebo       3.09ms  3.45ms      279.     230KB    11.0    127     5      455ms <NULL> <Rprofmem[,3]~ <bch:tm ~ <tibble [~
# 3 r2evans     1.91ms  2.22ms      421.     170KB     8.46   199     4      473ms <NULL> <Rprofmem[,3]~ <bch:tm ~ <tibble [~
  • Related