Faster Populate Matrix-CodePudding

I have a dataframe that looks like the following:

  Examples Type
1 example1    a
2 example1    b
3 example1    c
4 example1    c
5 example2    c

In a matrix, where rows and columns correspond to each example, I want to calculate the intersection of types between examples.

my_mat <- matrix(0, nrow=length(unique(df$Examples)), ncol=length(unique(df$Examples))) 
rownames(my_mat) <- unique(df$Examples)
colnames(my_mat) <- unique(df$Examples)

The code I currently have is a double for-loop, which is significantly slower at larger scales.

get_intersection <- function(example1, example2) {
  return(length(dplyr::intersect(example1, example2)))
}

for (i in 1:nrow(my_mat)) {
  curr_row <- rownames(my_mat)[i]
  for (j in 1:ncol(my_mat)) {
    curr_col <- colnames(my_mat)[j]
    my_mat[i, j] <- get_intersection(df$Type[which(df$Examples %in% curr_row)], 
                                     df$Type[which(df$Examples %in% curr_col)])
  }
}

How can I use the "apply" methods to accelerate the population of this matrix?

Data

df <- structure(list(Examples = c("example1", "example1", "example1", 
"example1", "example2"), Type = c("a", "b", "c", "c", "c")), class = "data.frame", row.names = c(NA, 
-5L))

CodePudding user response：

Not sure for what you need the matrix, you could use outer to consecutively iterate a function f over the unique values of the "Examples" column.

f <- \(x, y) length(intersect(df[df$Examples == x, 'Type'], df[df$Examples == y, 'Type']))
u <- unique(df$Examples)
outer(u, u, Vectorize(f)) |> `dimnames<-`(list(u, u))
#          example1 example2
# example1        3        1
# example2        1        1

Data:

df <- structure(list(Examples = c("example1", "example1", "example1", 
"example1", "example2"), Type = c("a", "b", "c", "c", "c")), class = "data.frame", row.names = c(NA, 
-5L))

CodePudding user response：

If we pivot the data, we can use matrix multiplication:

library(dplyr)  
library(tidyr)
dfw = df %>%
  unique %>% 
  mutate(n = 1) %>%
  pivot_wider(names_from = Type, values_from = n, values_fill = 0) %>%
  as.data.frame

row.names(dfw) = dfw$Examples
dfm = as.matrix(dfw[-1])
result = dfm %*% t(dfm)
result
#          example1 example2
# example1        3        1
# example2        1        1

CodePudding user response：

I haven't benchmarked it, but this version should be a bit faster:

df <- data.frame(Examples = c('example1', 'example1', 'example1', 'example1', 'example2'), 
                 Type = c('a', 'b', 'c', 'c', 'c'), 
                 stringsAsFactors = FALSE)
examples <- unique(df$Examples)
my_mat <- matrix(0, nrow = length(examples), ncol = length(examples)) 
rownames(my_mat) <- examples 
colnames(my_mat) <- examples
perms <- gtools::permutations(v = examples, 
                              n = length(examples), 
                              r = 2, 
                              repeats.allowed = TRUE)
apply(perms, 1, function(x) {
  result <- intersect(df[ df$Examples == x[ 1 ], 'Type' ], 
                      df[ df$Examples == x[ 2 ], 'Type' ]) |>
    length()
  my_mat[ x[ 1 ], x[ 2 ] ] <<- result
}) |> invisible()
print(df)
print(my_mat)

CodePudding user response：

We can use tcrossprod table

> tcrossprod(table(unique(df)))
          Examples
Examples   example1 example2
  example1        3        1
  example2        1        1

> tcrossprod(table(df) > 0)
          Examples
Examples   example1 example2
  example1        3        1
  example2        1        1