I have a boolean matrix with observations in rows and features in columns.
> df
TP53 PIK3CA GATA3 MAP3K1 PTEN KMT2C AKT1 NF1 FOXA1 SF3B1
ACYZ FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
CKPD FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
AEXF FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
AGNS FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
AJEH TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
AJYK TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
AKVJ TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
ALGN FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
AMFQ FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
ANPD FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
I want to create a symmetric matrix (features × features) based on the matrix above, in which each cell holds the count of observations (a count, not a binary flag) for which both the row feature and the column feature are TRUE (TRUE-TRUE).
> output
TP53 PIK3CA GATA3 MAP3K1 PTEN KMT2C AKT1 NF1 FOXA1 SF3B1
TP53 0 1 0 0 0 0 1 1 0 0
PIK3CA 1 0 0 0 0 0 1 0 0 0
...
data
# Reproducible definition of the 10 x 10 logical matrix printed above as `df`:
# row names are the observations, column names are the features. The values
# are listed column by column (R's storage order for matrices).
structure(c(FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE,
FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE,
FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE,
TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE), .Dim = c(10L, 10L), .Dimnames = list(c("ACYZ",
"CKPD", "AEXF", "AGNS", "AJEH", "AJYK", "AKVJ", "ALGN", "AMFQ",
"ANPD"), c("TP53", "PIK3CA", "GATA3", "MAP3K1", "PTEN", "KMT2C",
"AKT1", "NF1", "FOXA1", "SF3B1")))
Thanks.
CodePudding user response:
This is a version in which cells (row, col) = (CKPD, PIK3CA) and (PIK3CA, CKPD) are treated differently, resulting in output cells containing either 0 or 1.
library(tidyverse)
# Example input: a 10 x 10 logical matrix with one row per observation and
# one column per feature. `matrix()` fills column by column, so each line of
# values below is one complete feature column, in row-name order.
data <- matrix(
  c(
    # TP53
    FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE,
    # PIK3CA
    FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE,
    # GATA3
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    # MAP3K1
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
    # PTEN
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
    # KMT2C
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE,
    # AKT1
    FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,
    # NF1
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
    # FOXA1
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    # SF3B1
    FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
  ),
  nrow = 10L,
  ncol = 10L,
  dimnames = list(
    c(
      "ACYZ", "CKPD", "AEXF", "AGNS", "AJEH",
      "AJYK", "AKVJ", "ALGN", "AMFQ", "ANPD"
    ),
    c(
      "TP53", "PIK3CA", "GATA3", "MAP3K1", "PTEN",
      "KMT2C", "AKT1", "NF1", "FOXA1", "SF3B1"
    )
  )
)
# Every label that appears as a row name or a column name of `data`, without
# duplicates: row names first, followed by the column names not already seen.
all_features <- union(rownames(data), colnames(data))
# Step 1: reshape the matrix into long form, one row per (from, to) cell.
# `from` is the original row name, `to` the original column name, and `value`
# the logical cell content.
long_pairs <- data %>%
  as_tibble(rownames = "from") %>%
  pivot_longer(cols = -from, names_to = "to")

# Step 2: within each (from, to) pair, total `value` (summing logicals counts
# the TRUEs) and turn both label columns into factors that share the full
# `all_features` level set.
pair_totals <- long_pairs %>%
  group_by(from, to) %>%
  mutate(
    from = factor(from, levels = all_features),
    to = factor(to, levels = all_features),
    value = sum(value)
  ) %>%
  ungroup()

# Step 3: add every combination of factor levels that is absent from the data
# with a value of 0, then spread `to` back out into columns. The result is a
# square table over `all_features` with `from` as its leading label column.
pair_totals %>%
  complete(from, to, fill = list(value = 0)) %>%
  pivot_wider(names_from = to, values_from = value)
#> # A tibble: 20 × 21
#> from ACYZ CKPD AEXF AGNS AJEH AJYK AKVJ ALGN AMFQ ANPD TP53
#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 ACYZ 0 0 0 0 0 0 0 0 0 0 0
#> 2 CKPD 0 0 0 0 0 0 0 0 0 0 0
#> 3 AEXF 0 0 0 0 0 0 0 0 0 0 0
#> 4 AGNS 0 0 0 0 0 0 0 0 0 0 0
#> 5 AJEH 0 0 0 0 0 0 0 0 0 0 1
#> 6 AJYK 0 0 0 0 0 0 0 0 0 0 1
#> 7 AKVJ 0 0 0 0 0 0 0 0 0 0 1
#> 8 ALGN 0 0 0 0 0 0 0 0 0 0 0
#> 9 AMFQ 0 0 0 0 0 0 0 0 0 0 0
#> 10 ANPD 0 0 0 0 0 0 0 0 0 0 0
#> 11 TP53 0 0 0 0 0 0 0 0 0 0 0
#> 12 PIK3CA 0 0 0 0 0 0 0 0 0 0 0
#> 13 GATA3 0 0 0 0 0 0 0 0 0 0 0
#> 14 MAP3K1 0 0 0 0 0 0 0 0 0 0 0
#> 15 PTEN 0 0 0 0 0 0 0 0 0 0 0
#> 16 KMT2C 0 0 0 0 0 0 0 0 0 0 0
#> 17 AKT1 0 0 0 0 0 0 0 0 0 0 0
#> 18 NF1 0 0 0 0 0 0 0 0 0 0 0
#> 19 FOXA1 0 0 0 0 0 0 0 0 0 0 0
#> 20 SF3B1 0 0 0 0 0 0 0 0 0 0 0
#> # … with 9 more variables: PIK3CA <int>, GATA3 <int>, MAP3K1 <int>, PTEN <int>,
#> # KMT2C <int>, AKT1 <int>, NF1 <int>, FOXA1 <int>, SF3B1 <int>
Created on 2022-03-14 by the reprex package (v2.0.0)
Note
If you want co-occurrence (e.g. samples containing both genes) or correlation with numerical outcomes, you need to sum the counts from cells (a, b) and (b, a). This results in cells containing 0, 1, or 2. That is not what this answer does, but a symmetric matrix in the mathematical sense would require it.