Eliminating duplicate rows from multiple dataframes-CodePudding

I have a question about data sorting. In my dataset, there are 5 cell clusters. There is a list of enriched genes for each cluster as separate dataframes. However, some genes are enriched in 2 or more clusters, and I would like to eliminate those genes. How can I do that?

In the simplified example below, rows named “MYH6”, “MYOD” and “GAPDH” should be eliminated because they are in 2 or more dataframes.

I've tried list() but didn't know what to do next. unique() didn't work.

cluster1 <- data.frame(logFC  = c("1", "0.5", "0.7","0.5"))
rownames(cluster1) <- c("MYH6","ACTA1","TNNT2","GAPDH")

cluster2 <- data.frame(logFC  = c("1", "0.8", "0.6","1.2"))
rownames(cluster2) <- c("MYH6","MYOD","PECAM1","GAPDH")

cluster3 <- data.frame(logFC  = c("2", "0.9", "1.5","0.7"))
rownames(cluster3) <- c("MYL7","MYOD","CD34","GAPDH")

Thank you in advance.

CodePudding user response：

Find all objects with names "cluster" and store them into a list.
Extract row names from each cluster, unlist, and find duplicated genes.
Filter out those rows on each cluster where the row names match the duplicated genes.

clust.list <- mget(ls(pattern = 'cluster'))
gene.rm <- names(which(table(unlist(lapply(clust.list, row.names))) > 1))
res.list <- lapply(clust.list, \(x) x[!row.names(x) %in% gene.rm, , drop = FALSE])

res.list
# $cluster1
#       logFC
# ACTA1   0.5
# TNNT2   0.7
# 
# $cluster2
#        logFC
# PECAM1   0.6
# 
# $cluster3
#      logFC
# MYL7     2
# CD34   1.5

(Optional) If you want to overwrite all clusters from res.list to the global environment
```
list2env(res.list, .GlobalEnv)
```

CodePudding user response：

Here a solution filtering duplicated:

library(dplyr)

cluster1 <- data.frame(logFC  = c("1", "0.5", "0.7","0.5"))
rownames(cluster1) <- c("MYH6","ACTA1","TNNT2","GAPDH")

cluster2 <- data.frame(logFC  = c("1", "0.8", "0.6","1.2"))
rownames(cluster2) <- c("MYH6","MYOD","PECAM1","GAPDH")

cluster3 <- data.frame(logFC  = c("2", "0.9", "1.5","0.7"))
rownames(cluster3) <- c("MYL7","MYOD","CD34","GAPDH")

row_names <- unlist(list(rownames(cluster1), rownames(cluster2), rownames(cluster3)))
row_names <- row_names[which(!(duplicated(row_names) | duplicated(row_names, fromLast = TRUE)))]

cluster1 <- cluster1 %>% filter(row.names(cluster1) %in% row_names)
cluster2 <- cluster2 %>% filter(row.names(cluster2) %in% row_names)
cluster3 <- cluster3 %>% filter(row.names(cluster3) %in% row_names)

clusters <- rbind(cluster1, cluster2, cluster3)

Output:

> clusters
       logFC
ACTA1    0.5
TNNT2    0.7
PECAM1   0.6
MYL7       2
CD34     1.5

If you only need your original three dataframes, just don´t apply the final rbind

> cluster1
      logFC
ACTA1   0.5
TNNT2   0.7
> cluster2
      logFC
PECAM1  0.6
> cluster3
     logFC
MYL7     2
CD34   1.5

CodePudding user response：

Please try the below code, where i update the individual dataframes to have a variable grp, so for cluster1 the grp=1 and for cluster2 grp=2 and so on. we can then use the grp to separate the dataframes

data

cluster1 <- data.frame(logFC  = c("1", "0.5", "0.7","0.5"), grp=1)
rownames(cluster1) <- c("MYH6","ACTA1","TNNT2","GAPDH")

cluster2 <- data.frame(logFC  = c("1", "0.8", "0.6","1.2"), grp=2)
rownames(cluster2) <- c("MYH6","MYOD","PECAM1","GAPDH")

cluster3 <- data.frame(logFC  = c("2", "0.9", "1.5","0.7"), grp=3)
rownames(cluster3) <- c("MYL7","MYOD","CD34","GAPDH")

library(tidyverse)


cluster_all <- bind_rows(cluster1,cluster2,cluster3) %>% rownames_to_column() %>% 
  mutate(rowname=str_replace(rowname,'\\. \\d ','')) %>% arrange(rowname) %>% 
  group_by(rowname) %>% mutate(cnt=n()) %>% filter(cnt<=1)


for (i in c('cluster1','cluster2','cluster3')){
  j <- match(i ,c('cluster1','cluster2','cluster3'))
  cluster_allx <- cluster_all %>% filter(grp==j)
  assign(i,cluster_allx, envir = .GlobalEnv)
}

^{Created on 2023-01-19 with reprex v2.0.2}

# A tibble: 2 × 4
# Groups:   rowname [2]
  rowname logFC   grp   cnt
  <chr>   <chr> <dbl> <int>
1 ACTA1   0.5       1     1
2 TNNT2   0.7       1     1


# A tibble: 1 × 4
# Groups:   rowname [1]
  rowname logFC   grp   cnt
  <chr>   <chr> <dbl> <int>
1 PECAM1  0.6       2     1


# A tibble: 2 × 4
# Groups:   rowname [2]
  rowname logFC   grp   cnt
  <chr>   <chr> <dbl> <int>
1 CD34    1.5       3     1
2 MYL7    2         3     1

CodePudding user response：

Tidyverse solution with mapping and splitting:

library(purrr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(tibble, warn.conflicts = FALSE)
library(magrittr, warn.conflicts = FALSE)

cluster1 <- data.frame(logFC  = c("1", "0.5", "0.7","0.5"))
rownames(cluster1) <- c("MYH6","ACTA1","TNNT2","GAPDH")

cluster2 <- data.frame(logFC  = c("1", "0.8", "0.6","1.2"))
rownames(cluster2) <- c("MYH6","MYOD","PECAM1","GAPDH")

cluster3 <- data.frame(logFC  = c("2", "0.9", "1.5","0.7"))
rownames(cluster3) <- c("MYL7","MYOD","CD34","GAPDH")

# Get all dataframes
ls(pattern = 'cluster') %>%
  # Add their names
  set_names(., .) %>%
  # Load them
  map(get) %>%
  # Move rownames to column
  map(rownames_to_column) %>%
  # Add column for which dataset each row comes from
  imap(mutate) %>% 
  map(rename, dataframe = last_col()) %>% 
  # Bind them as a single df
  bind_rows() %>% 
  # Count duplicated rows
  add_count(rowname) %>% 
  # Only keep unique rows
  filter(n == 1) %>% 
  # Split data back to list
  split.data.frame(.$dataframe) %>% 
  # Put column back to rownames
  map(~ set_rownames(.x, .x$rowname)) %>% 
  # Remove auxilary columns
  map(select, -c(dataframe, n, rowname)) %>% 
  # Overwrite original data
  list2env(.GlobalEnv)
#> <environment: R_GlobalEnv>

# Print results
ls(pattern = 'cluster') %>% 
  set_names(., .) %>%
  map(get) 
#> $cluster1
#>       logFC
#> ACTA1   0.5
#> TNNT2   0.7
#> 
#> $cluster2
#>        logFC
#> PECAM1   0.6
#> 
#> $cluster3
#>      logFC
#> MYL7     2
#> CD34   1.5

^{Created on 2023-01-19 with reprex v2.0.2}