I have a question about data sorting. In my dataset, there are 5 cell clusters. There is a list of enriched genes for each cluster as separate dataframes. However, some genes are enriched in 2 or more clusters, and I would like to eliminate those genes. How can I do that?
In the simplified example below, rows named “MYH6”, “MYOD” and “GAPDH” should be eliminated because they are in 2 or more dataframes.
I've tried list() but didn't know what to do next. unique() didn't work.
cluster1 <- data.frame(logFC = c("1", "0.5", "0.7","0.5"))
rownames(cluster1) <- c("MYH6","ACTA1","TNNT2","GAPDH")
cluster2 <- data.frame(logFC = c("1", "0.8", "0.6","1.2"))
rownames(cluster2) <- c("MYH6","MYOD","PECAM1","GAPDH")
cluster3 <- data.frame(logFC = c("2", "0.9", "1.5","0.7"))
rownames(cluster3) <- c("MYL7","MYOD","CD34","GAPDH")
Thank you in advance.
CodePudding user response:
- Find all objects with names "cluster" and store them into a list.
- Extract row names from each cluster, unlist, and find duplicated genes.
- Filter out those rows on each cluster where the row names match the duplicated genes.
clust.list <- mget(ls(pattern = 'cluster'))
gene.rm <- names(which(table(unlist(lapply(clust.list, row.names))) > 1))
res.list <- lapply(clust.list, \(x) x[!row.names(x) %in% gene.rm, , drop = FALSE])
res.list
# $cluster1
# logFC
# ACTA1 0.5
# TNNT2 0.7
#
# $cluster2
# logFC
# PECAM1 0.6
#
# $cluster3
# logFC
# MYL7 2
# CD34 1.5
(Optional) If you want to overwrite all clusters from
res.list
to the global environmentlist2env(res.list, .GlobalEnv)
CodePudding user response:
Here a solution filtering duplicated:
library(dplyr)
cluster1 <- data.frame(logFC = c("1", "0.5", "0.7","0.5"))
rownames(cluster1) <- c("MYH6","ACTA1","TNNT2","GAPDH")
cluster2 <- data.frame(logFC = c("1", "0.8", "0.6","1.2"))
rownames(cluster2) <- c("MYH6","MYOD","PECAM1","GAPDH")
cluster3 <- data.frame(logFC = c("2", "0.9", "1.5","0.7"))
rownames(cluster3) <- c("MYL7","MYOD","CD34","GAPDH")
row_names <- unlist(list(rownames(cluster1), rownames(cluster2), rownames(cluster3)))
row_names <- row_names[which(!(duplicated(row_names) | duplicated(row_names, fromLast = TRUE)))]
cluster1 <- cluster1 %>% filter(row.names(cluster1) %in% row_names)
cluster2 <- cluster2 %>% filter(row.names(cluster2) %in% row_names)
cluster3 <- cluster3 %>% filter(row.names(cluster3) %in% row_names)
clusters <- rbind(cluster1, cluster2, cluster3)
Output:
> clusters
logFC
ACTA1 0.5
TNNT2 0.7
PECAM1 0.6
MYL7 2
CD34 1.5
If you only need your original three dataframes, just don´t apply the final rbind
> cluster1
logFC
ACTA1 0.5
TNNT2 0.7
> cluster2
logFC
PECAM1 0.6
> cluster3
logFC
MYL7 2
CD34 1.5
CodePudding user response:
Please try the below code, where i update the individual dataframes to have a variable grp, so for cluster1
the grp=1
and for cluster2
grp=2
and so on. we can then use the grp to separate the dataframes
data
cluster1 <- data.frame(logFC = c("1", "0.5", "0.7","0.5"), grp=1)
rownames(cluster1) <- c("MYH6","ACTA1","TNNT2","GAPDH")
cluster2 <- data.frame(logFC = c("1", "0.8", "0.6","1.2"), grp=2)
rownames(cluster2) <- c("MYH6","MYOD","PECAM1","GAPDH")
cluster3 <- data.frame(logFC = c("2", "0.9", "1.5","0.7"), grp=3)
rownames(cluster3) <- c("MYL7","MYOD","CD34","GAPDH")
library(tidyverse)
cluster_all <- bind_rows(cluster1,cluster2,cluster3) %>% rownames_to_column() %>%
mutate(rowname=str_replace(rowname,'\\. \\d ','')) %>% arrange(rowname) %>%
group_by(rowname) %>% mutate(cnt=n()) %>% filter(cnt<=1)
for (i in c('cluster1','cluster2','cluster3')){
j <- match(i ,c('cluster1','cluster2','cluster3'))
cluster_allx <- cluster_all %>% filter(grp==j)
assign(i,cluster_allx, envir = .GlobalEnv)
}
Created on 2023-01-19 with reprex v2.0.2
# A tibble: 2 × 4
# Groups: rowname [2]
rowname logFC grp cnt
<chr> <chr> <dbl> <int>
1 ACTA1 0.5 1 1
2 TNNT2 0.7 1 1
# A tibble: 1 × 4
# Groups: rowname [1]
rowname logFC grp cnt
<chr> <chr> <dbl> <int>
1 PECAM1 0.6 2 1
# A tibble: 2 × 4
# Groups: rowname [2]
rowname logFC grp cnt
<chr> <chr> <dbl> <int>
1 CD34 1.5 3 1
2 MYL7 2 3 1
CodePudding user response:
Tidyverse solution with mapping and splitting:
library(purrr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(tibble, warn.conflicts = FALSE)
library(magrittr, warn.conflicts = FALSE)
cluster1 <- data.frame(logFC = c("1", "0.5", "0.7","0.5"))
rownames(cluster1) <- c("MYH6","ACTA1","TNNT2","GAPDH")
cluster2 <- data.frame(logFC = c("1", "0.8", "0.6","1.2"))
rownames(cluster2) <- c("MYH6","MYOD","PECAM1","GAPDH")
cluster3 <- data.frame(logFC = c("2", "0.9", "1.5","0.7"))
rownames(cluster3) <- c("MYL7","MYOD","CD34","GAPDH")
# Get all dataframes
ls(pattern = 'cluster') %>%
# Add their names
set_names(., .) %>%
# Load them
map(get) %>%
# Move rownames to column
map(rownames_to_column) %>%
# Add column for which dataset each row comes from
imap(mutate) %>%
map(rename, dataframe = last_col()) %>%
# Bind them as a single df
bind_rows() %>%
# Count duplicated rows
add_count(rowname) %>%
# Only keep unique rows
filter(n == 1) %>%
# Split data back to list
split.data.frame(.$dataframe) %>%
# Put column back to rownames
map(~ set_rownames(.x, .x$rowname)) %>%
# Remove auxilary columns
map(select, -c(dataframe, n, rowname)) %>%
# Overwrite original data
list2env(.GlobalEnv)
#> <environment: R_GlobalEnv>
# Print results
ls(pattern = 'cluster') %>%
set_names(., .) %>%
map(get)
#> $cluster1
#> logFC
#> ACTA1 0.5
#> TNNT2 0.7
#>
#> $cluster2
#> logFC
#> PECAM1 0.6
#>
#> $cluster3
#> logFC
#> MYL7 2
#> CD34 1.5
Created on 2023-01-19 with reprex v2.0.2