I have a dataframe that looks like the following:
Examples Type
1 example1 a
2 example1 b
3 example1 c
4 example1 c
5 example2 c
In a matrix, where rows and columns correspond to each example, I want to calculate the intersection of types between examples.
my_mat <- matrix(0, nrow=length(unique(df$Examples)), ncol=length(unique(df$Examples)))
rownames(my_mat) <- unique(df$Examples)
colnames(my_mat) <- unique(df$Examples)
The code I currently have is a double for-loop, which is significantly slower at larger scales.
get_intersection <- function(example1, example2) {
return(length(dplyr::intersect(example1, example2)))
}
for (i in 1:nrow(my_mat)) {
curr_row <- rownames(my_mat)[i]
for (j in 1:ncol(my_mat)) {
curr_col <- colnames(my_mat)[j]
my_mat[i, j] <- get_intersection(df$Type[which(df$Examples %in% curr_row)],
df$Type[which(df$Examples %in% curr_col)])
}
}
How can I use the "apply" methods to accelerate the population of this matrix?
Data
df <- structure(list(Examples = c("example1", "example1", "example1",
"example1", "example2"), Type = c("a", "b", "c", "c", "c")), class = "data.frame", row.names = c(NA,
-5L))
CodePudding user response:
Not sure for what you need the matrix, you could use outer
to consecutively iterate a function f
over the unique
values of the "Examples"
column.
f <- \(x, y) length(intersect(df[df$Examples == x, 'Type'], df[df$Examples == y, 'Type']))
u <- unique(df$Examples)
outer(u, u, Vectorize(f)) |> `dimnames<-`(list(u, u))
# example1 example2
# example1 3 1
# example2 1 1
Data:
df <- structure(list(Examples = c("example1", "example1", "example1",
"example1", "example2"), Type = c("a", "b", "c", "c", "c")), class = "data.frame", row.names = c(NA,
-5L))
CodePudding user response:
If we pivot the data, we can use matrix multiplication:
library(dplyr)
library(tidyr)
dfw = df %>%
unique %>%
mutate(n = 1) %>%
pivot_wider(names_from = Type, values_from = n, values_fill = 0) %>%
as.data.frame
row.names(dfw) = dfw$Examples
dfm = as.matrix(dfw[-1])
result = dfm %*% t(dfm)
result
# example1 example2
# example1 3 1
# example2 1 1
CodePudding user response:
I haven't benchmarked it, but this version should be a bit faster:
df <- data.frame(Examples = c('example1', 'example1', 'example1', 'example1', 'example2'),
Type = c('a', 'b', 'c', 'c', 'c'),
stringsAsFactors = FALSE)
examples <- unique(df$Examples)
my_mat <- matrix(0, nrow = length(examples), ncol = length(examples))
rownames(my_mat) <- examples
colnames(my_mat) <- examples
perms <- gtools::permutations(v = examples,
n = length(examples),
r = 2,
repeats.allowed = TRUE)
apply(perms, 1, function(x) {
result <- intersect(df[ df$Examples == x[ 1 ], 'Type' ],
df[ df$Examples == x[ 2 ], 'Type' ]) |>
length()
my_mat[ x[ 1 ], x[ 2 ] ] <<- result
}) |> invisible()
print(df)
print(my_mat)
CodePudding user response:
We can use tcrossprod
table
> tcrossprod(table(unique(df)))
Examples
Examples example1 example2
example1 3 1
example2 1 1
or
> tcrossprod(table(df) > 0)
Examples
Examples example1 example2
example1 3 1
example2 1 1