I am trying to define a function that takes a data frame or table as input with a specific number of ID columns (e.g., 2 or 3 ID columns), and the remaining columns are NAME1, NAME2, ..., NAMEK (numeric columns). The output should be a data table that consists of the same ID columns as before plus one additional ID column that groups each unique pairwise combination of the column names (NAME1, NAME2, ...). In addition, we must gather the actual values of the numeric columns into two new columns based on the ID column; an example with two ID columns and three numeric columns:
ID1 <- c("A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3)
NAME1 <- c(10,11,9,22,25,22)
NAME2 <- c(7,9,8,20,22,21)
NAME3 <- c(10,12,11,15,19,30)
DT <- data.table(ID1,ID2,NAME1,NAME2,NAME3)
I want the output of the function with DT as input to be
ID.new <- c("NAME1 - NAME2","NAME1 - NAME2","NAME1 - NAME2", "NAME1 - NAME2",
"NAME1 - NAME2","NAME1 - NAME2", "NAME1 - NAME3", "NAME1 - NAME3",
"NAME1 - NAME3","NAME1 - NAME3","NAME1 - NAME3","NAME1 - NAME3",
"NAME2 - NAME3","NAME2 - NAME3","NAME2 - NAME3","NAME2 - NAME3",
"NAME2 - NAME3", "NAME2 - NAME3")
ID1 <- c("A","A","A","B","B","B","A","A","A","B","B","B","A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
value.left <- c(10,11,9,22,25,22,10,11,9,22,25,22,7,9,8,20,22,21)
value.right <- c(7,9,8,20,22,21,10,12,11,15,19,30,10,12,11,15,19,30)
DT.output <- data.table(ID.new,ID1,ID2,value.left,value.right)
I have found that fun() (see below) does the job, but is too slow for my liking:
fun <- function(data, ID.cols){
data <- data.table(data)
# Which of the columns are ID columns
ids <- which(colnames(data) %in% ID.cols)
# Obtain all pairwise combinations of numeric columns into a list
numeric.combs <- combn(x = data.table(data)[,!ids, with = FALSE], m = 2, simplify = FALSE)
id.cols <- data[,ids, with = FALSE]
# bind the ID columns to each pairwise combination of numeric columns inside the list
bind.columns.each.numeric.comb <- lapply(X = numeric.combs, FUN = function(x) cbind(id.cols,x))
# Create generic names for the numeric columns so that rbindlist() may be applied. In addition we make a new column that groups based on which columns we are considering
generalize <- suppressWarnings(lapply(X = bind.columns.each.numeric.comb, FUN = function(x)
setattr(x = x[,ID.NEW:=paste(colnames(x[,!ids,with=FALSE]),collapse=" - ")], name =
'names', value = c(ID.cols,"value.left","value.right","ID.NEW"))))
return(rbindlist(l=generalize))
}
# Performance
print(microbenchmark(fun(DT,ID.cols=c("ID1","ID2")),times=1000))
Is there a faster and more elegant way to do this?
CodePudding user response:
You can turn DT
into long format by melt
firstly.
Then to shift
value with the step -nrow(DT)
in order to do
the minus operation, i.e. NAME1 - NAME2, NAME2 - NAME3, NAME3 - NAME1
.
Finally, we need to correct NAME3 - NAME1
to NAME1 - NAME3
.
ds = melt(DT,
id.vars = c("ID1","ID2"),
measure.vars = c("NAME1","NAME2","NAME3"),
value.name = c("value.left")
)
group_len = nrow(DT)
ds[, value.right := shift(value.left, n = -group_len, type = c("cyclic"))]
ds[, ID.new := value.left - value.right]
# NAME3 - NAME1 to NAME1 - NAME3
ds[variable %in% c("NAME3"), value.left := value.left - ID.new]
ds[variable %in% c("NAME3"), value.right := value.right ID.new]
ds[variable %in% c("NAME3"), ID.new := - ID.new]
ds[,.(ID.new, ID1, ID2, value.left, value.right)]
output:
ID.new ID1 ID2 value.left value.right
<num> <char> <num> <num> <num>
1: 3 A 1 10 7
2: 2 A 2 11 9
3: 1 A 3 9 8
4: 2 B 1 22 20
5: 3 B 2 25 22
6: 1 B 3 22 21
7: -3 A 1 7 10
8: -3 A 2 9 12
9: -3 A 3 8 11
10: 5 B 1 20 15
11: 3 B 2 22 19
12: -9 B 3 21 30
13: 0 A 1 10 10
14: -1 A 2 11 12
15: -2 A 3 9 11
16: 7 B 1 22 15
17: 6 B 2 25 19
18: -8 B 3 22 30
CodePudding user response:
I think the approach using combn()
seems reasonable to me. Here your function modified to a base R version. It should be scalable to k columns.
fun2 <- function(data, ID.cols){
ids <- which(colnames(data) %in% ID.cols)
## you can loop over the combinations directly
new_dat <- combn(data[-ids], 2, function(x) {
new_x <- setNames(x, paste("value", c("left", "right"), sep = "."))
## use paste with collapse for the ID.new
new_x$ID.new <- paste(names(x), collapse = " - ")
# cbind(new_x, data[ids])
new_x
}, simplify = FALSE)
## bind it with the old ID columns, outside the loop (bit faster)
cbind(do.call(rbind, new_dat), data[ids])
}
fun2(DT,ID.cols = c("ID1", "ID2"))
#> value.left value.right ID.new ID1 ID2
#> 1 10 7 NAME1 - NAME2 A 1
#> 2 11 9 NAME1 - NAME2 A 2
#> 3 9 8 NAME1 - NAME2 A 3
#> 4 22 20 NAME1 - NAME2 B 1
#> 5 25 22 NAME1 - NAME2 B 2
#> 6 22 21 NAME1 - NAME2 B 3
#> 7 10 10 NAME1 - NAME3 A 1
#> 8 11 12 NAME1 - NAME3 A 2
#> 9 9 11 NAME1 - NAME3 A 3
#> 10 22 15 NAME1 - NAME3 B 1
#> 11 25 19 NAME1 - NAME3 B 2
#> 12 22 30 NAME1 - NAME3 B 3
#> 13 7 10 NAME2 - NAME3 A 1
#> 14 9 12 NAME2 - NAME3 A 2
#> 15 8 11 NAME2 - NAME3 A 3
#> 16 20 15 NAME2 - NAME3 B 1
#> 17 22 19 NAME2 - NAME3 B 2
#> 18 21 30 NAME2 - NAME3 B 3
Benchmark
microbenchmark::microbenchmark(
pernkf = fun(DT, ID.cols = c("ID1", "ID2")),
tjebo = fun2(DT, ID.cols = c("ID1", "ID2")), times = 1000
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> pernkf 2957.569 3160.1870 4732.249 3338.4275 4138.896 133705.15 1000
#> tjebo 345.292 372.5455 560.661 418.5015 460.802 21592.78 1000
CodePudding user response:
A melted, self-join option:
library(data.table)
DTlong <- melt(DT, id.vars = c("ID1", "ID2"), variable.factor = FALSE)
out <- DTlong[DTlong, on = .(ID1, ID2), allow.cartesian = TRUE
][variable < i.variable,
][, .(ID.new = paste(variable, i.variable, sep = " - "),
ID1, ID2, value.left = value, value.right = i.value)]
out
# ID.new ID1 ID2 value.left value.right
# <char> <char> <num> <num> <num>
# 1: NAME1 - NAME2 A 1 10 7
# 2: NAME1 - NAME2 A 2 11 9
# 3: NAME1 - NAME2 A 3 9 8
# 4: NAME1 - NAME2 B 1 22 20
# 5: NAME1 - NAME2 B 2 25 22
# 6: NAME1 - NAME2 B 3 22 21
# 7: NAME1 - NAME3 A 1 10 10
# 8: NAME2 - NAME3 A 1 7 10
# 9: NAME1 - NAME3 A 2 11 12
# 10: NAME2 - NAME3 A 2 9 12
# 11: NAME1 - NAME3 A 3 9 11
# 12: NAME2 - NAME3 A 3 8 11
# 13: NAME1 - NAME3 B 1 22 15
# 14: NAME2 - NAME3 B 1 20 15
# 15: NAME1 - NAME3 B 2 25 19
# 16: NAME2 - NAME3 B 2 22 19
# 17: NAME1 - NAME3 B 3 22 30
# 18: NAME2 - NAME3 B 3 21 30
### validation
setorder(out, ID.new, ID1, ID2)
identical(DT.output, out)
# [1] TRUE
Benchmarking:
bench::mark(
pernkf = fun(DT, c("ID1", "ID2")),
tjebo = fun2(DT, c("ID1", "ID2")),
r2evans = fun3(DT, c("ID1", "ID2")),
check = FALSE)
# A tibble: 3 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 pernkf 3.04ms 3.59ms 264. 334KB 11.6 114 5 432ms <NULL> <Rprofmem[,3]~ <bch:tm ~ <tibble [~
# 2 tjebo 3.09ms 3.45ms 279. 230KB 11.0 127 5 455ms <NULL> <Rprofmem[,3]~ <bch:tm ~ <tibble [~
# 3 r2evans 1.91ms 2.22ms 421. 170KB 8.46 199 4 473ms <NULL> <Rprofmem[,3]~ <bch:tm ~ <tibble [~