R- convert a list of character vectors into a binary matrix-CodePudding

I am working in R and have a named list of character vectors. Each vector describes the genes present in a biological pathway.

Please see example below:

# Example input: a named list with one character vector of gene IDs per pathway.
gene_sets <- list(
  pathwayX = "Gene3",
  pathwayY = c("Gene2", "Gene3"),
  pathwayz = c("Gene1", "Gene2", "Gene3")
)

> gene_sets
$pathwayX
"Gene3"

$pathwayY
"Gene2" "Gene3"

$pathwayZ
"Gene1" "Gene2" "Gene3"

What I want to do is to convert the list into a binary matrix, with genes as columns and pathways as rows. 0 means gene is not present in the pathway and 1 means gene is present.

	Gene1	Gene2	Gene3
pathwayX	0	0	1
pathwayY	0	1	1
pathwayZ	1	1	1

CodePudding user response：

We can use

# For each pathway, test which of the listed genes it contains; sapply() puts
# pathways in columns, so transpose to get one row per pathway.
# `%in%` yields TRUE/FALSE; the unary `+` converts the logical matrix to 0/1.
+t(sapply(gene_sets, "%in%", x = c("Gene1", "Gene2", "Gene3")))

If you want to dynamically obtain c("Gene1", "Gene2", "Gene3"), we can do

## All distinct gene IDs across pathways, sorted; these become the columns.
GeneID <- sort(unique(unlist(gene_sets, use.names = FALSE)))

## For every pathway, flag which of the GeneID entries it contains, then lay the
## flags out row by row. `%in%` returns logicals, so they are converted to
## integer 0/1. Building the matrix with explicit dimensions keeps pathways as
## rows even when there is only a single gene.
mat <- matrix(
  as.integer(unlist(
    lapply(gene_sets, function(genes) GeneID %in% genes),
    use.names = FALSE
  )),
  nrow = length(gene_sets),
  ncol = length(GeneID),
  byrow = TRUE,
  dimnames = list(names(gene_sets), GeneID)
)  ## matrix output
#         Gene1 Gene2 Gene3
#pathwayX     0     0     1
#pathwayY     0     1     1
#pathwayz     1     1     1

## check.names = FALSE keeps gene IDs verbatim as column names instead of
## rewriting them into syntactic R names.
data.frame(mat, check.names = FALSE)  ## data.frame output

My impression is that gene-set problems are usually large and sparse. If you really have hundreds of thousands of genes and pathways, the following sparse-matrix solution is the best option.

## Build the pathway-by-gene membership matrix in sparse (triplet) form:
## one (row, column) pair per gene occurrence in a pathway.
pathwayID <- names(gene_sets)
n1 <- lengths(gene_sets, use.names = FALSE)  ## number of genes in each pathway
genesVec <- unlist(gene_sets, use.names = FALSE)
GeneID <- sort(unique(genesVec))
## row index: pathway position repeated once per gene it contains
## (seq_along() avoids the 1:0 sequence that 1:length(n1) gives for an empty list)
i <- rep(seq_along(n1), n1)
## column index: position of each gene in the sorted gene ID vector
j <- match(genesVec, GeneID)
## dims is given explicitly so a pathway with no genes still gets its (empty)
## row and the dimensions always agree with dimnames.
## NOTE(review): sparseMatrix() sums x over repeated (i, j) pairs, so a gene
## listed twice in one pathway would yield 2 -- de-duplicate first if needed.
Matrix::sparseMatrix(i = i, j = j, x = rep.int(1, length(i)),
                     dims = c(length(pathwayID), length(GeneID)),
                     dimnames = list(pathwayID, GeneID))
#3 x 3 sparse Matrix of class "dgCMatrix"
#         Gene1 Gene2 Gene3
#pathwayX     .     .     1
#pathwayY     .     1     1
#pathwayz     1     1     1

CodePudding user response：

You could also use mtabulate from qdapTools like this:

library(qdapTools)

# Example input: one character vector of gene IDs per pathway.
gene_sets <- list(
  pathwayX = "gene3",
  pathwayY = c("gene2", "gene3"),
  pathwayz = c("gene1", "gene2", "gene3")
)

# mtabulate() turns the list into the pathway-by-gene table shown below.
mtabulate(gene_sets)
#>          gene1 gene2 gene3
#> pathwayX     0     0     1
#> pathwayY     0     1     1
#> pathwayz     1     1     1

Created on 2022-07-18 by the reprex package (v2.0.1)

CodePudding user response：

An idea is to use table in combination with stack i.e.

# stack() flattens the list into a data frame with columns `values` (gene) and
# `ind` (pathway); cross-tabulating `ind` by `values` puts pathways in rows and
# genes in columns.
with(stack(gene_sets), table(ind, values))

          values
ind        Gene1 Gene2 Gene3
  pathwayX     0     0     1
  pathwayY     0     1     1
  pathwayz     1     1     1

CodePudding user response：

Here is an attempt to solve it with the tidyverse:

library(dplyr)
library(stringr)  # provides str_remove() and str_replace_all() used below
library(tibble)

# Flatten the list to one row per (pathway, gene), one-hot encode the gene
# column, then sum the indicator columns within each pathway.
as.data.frame(unlist(gene_sets)) %>%
  transmute(gene = as.factor(`unlist(gene_sets)`)) %>%
  rownames_to_column() %>%
  # unlist() appends a running index to the names of multi-gene pathways
  # (pathwayY1, pathwayY2, ...); strip the whole trailing number so pathways
  # with 10 or more genes still collapse to a single name.
  # NOTE(review): this also strips digits that are part of a pathway's own name
  # when it ends in a number -- confirm pathway names do not end in digits.
  mutate(rowname = str_remove(rowname, "[0-9]+$")) %>%
  # `+ 0` drops the intercept so every gene level gets its own indicator column
  cbind((model.matrix(~ gene + 0, .) == 1) * 1) %>%
  # model.matrix() prefixes the level names with the variable name ("gene")
  rename_with(., ~ str_replace_all(., "geneGene", "Gene")) %>%
  group_by(rowname) %>%
  summarise(across(-gene, ~ sum(.)))

  rowname  Gene1 Gene2 Gene3
  <chr>    <dbl> <dbl> <dbl>
1 pathwayX     0     0     1
2 pathwayY     0     1     1
3 pathwayz     1     1     1

CodePudding user response：

Here is a somewhat concise tidyverse way if one were compelled to go down that path.

library(dplyr)
library(purrr)
library(tibble)
library(tidyr)

# Turn each pathway into a one-row tibble with a 1L column per gene it contains,
# then row-bind; genes missing from a pathway come out as NA.
map_dfr(gene_sets,
        ~ as_tibble_row(set_names(rep(1L, length(.x)), .x)), .id = "row") %>%
  # keep `row` first, followed by the columns in alphabetical order
  relocate(row, order(colnames(.))) %>%
  # fill the NAs with 0; a lambda is used because passing extra arguments
  # through across()'s `...` is deprecated, and 0L keeps the columns integer
  mutate(across(-1, ~ replace_na(.x, 0L)))

# A tibble: 3 x 4
  row      Gene1 Gene2 Gene3
  <chr>    <int> <int> <int>
1 pathwayX     0     0     1
2 pathwayY     0     1     1
3 pathwayz     1     1     1