Home > Blockchain >  Merge column if duplicates in rows between columns
Merge column if duplicates in rows between columns

Time:11-21

I have a dataframe such as :

    COL1   COL2  COL3      COL4   COL5   COL6    COL7
1  Sp1-2  Sp1-2  Sp3_2-54  Sp3-2  Sp3-2  Sp3-2   SP9-43
2  Sp5-1  Sp5-2  Sp2-4     Sp9-2  Sp10-3 SP9-90  NA
3  Sp_7-3 Sp_7-3 NA        SP6-56 Sp2-7  SP3-3   NA

And I would simply like to merge columns when at leats two elements are duplicated.

for example, in COL1 and COL2, Sp1-2 & Sp_7-3 are duplicated in both columns, then I merge it that way by adding a pipe "|" between non-duplicated elements:

   COL1|COL2     COL3      COL4|COL5|COL6       COL7
1  Sp1-2         Sp3_2-54  Sp3-2                SP9-43
2  Sp5-1|Sp5-2   Sp2-4     Sp9-2|Sp10-3|SP9-90  NA
3  Sp_7-3        NA        SP6-56|Sp2-7|SP3-3   NA

Here is the dput format :

structure(list(COL1 = c("Sp1-2", "Sp5-1", "Sp_7-3"), COL2 = c("Sp1-2", 
"Sp5-2", "Sp_7-3"), COL3 = c("Sp3_2-54", "Sp2-4", NA), COL4 = c("Sp3-2", 
"Sp9-2", "SP6-56"), COL5 = c("Sp3-2", "Sp10-3", "Sp2-7"), COL6 = c("Sp3-2", 
"SP9-90", "SP3-3"), COL7 = c("SP9-43", NA, NA)), class = "data.frame", row.names = c(NA, 
-3L))

Another example :

           G136             G348           G465
1          NA               NA             NA
2          NA               NA             NA
3          SP4-140          SP4-140        NA
4          SP2-8            NA             NA
5          SP3-59           NA             NA
6          SP1_contig.682-8 NA             SP1_contig.682-8

expected output:

           G136|G348|G465
1          NA               
2          NA              
3          SP4-140          
4          SP2-8           
5          SP3-59           
6          SP1_contig.682-8 

the deput format :

dat<- structure(list(G136 = c(NA, NA, "SP4-140", "SP2-8", "SP3-59",  "SP1_contig.682-8", NA, NA, NA), G348 = c(NA, NA, "SP4-140",  NA, NA, NA, NA, NA, NA), G465 = c(NA, NA, NA, NA, NA, "SP1_contig.682-8",  NA, NA, NA)), row.names = c(NA, -9L), class = c("tbl_df", "tbl",  "data.frame"))

CodePudding user response:

This is probably best handled by reshaping your data first, then it's straight forward to use various groupings to achieve your desired result:

library(tidyr)
library(dplyr)

dat %>%
  rowid_to_column() %>%
  pivot_longer(-rowid) %>%
  filter(!is.na(value)) %>%
  group_by(rowid, value) %>%
  mutate(new_name = paste(name, collapse = "|")) %>%
  separate_rows(new_name, sep = "\\|") %>%
  group_by(name) %>%
  mutate(new_name = paste(unique(new_name), collapse = "|")) %>%
  group_by(value) %>%
  filter(nchar(new_name) == max(nchar(new_name))) %>%
  ungroup() %>%
  select(-name) %>%
  pivot_wider(names_from = new_name, values_from = value, values_fn = ~ paste(unique(.x), collapse = "|")) %>%
  complete(rowid = full_seq(c(1, rowid), 1))

# A tibble: 3 × 5
  rowid `COL1|COL2` COL3     `COL4|COL5|COL6`    COL7  
  <dbl> <chr>       <chr>    <chr>               <chr> 
1     1 Sp1-2       Sp3_2-54 Sp3-2               SP9-43
2     2 Sp5-1|Sp5-2 Sp2-4    Sp9-2|Sp10-3|SP9-90 NA    
3     3 Sp_7-3      NA       SP6-56|Sp2-7|SP3-3  NA     

And using the data in your second example gives:

# A tibble: 6 × 2
  rowid `G136|G348|G465`
  <dbl> <chr>           
1     1 NA              
2     2 NA              
3     3 SP4-140         
4     4 SP2-8           
5     5 SP3-59          
6     6 SP1_contig.682-8

CodePudding user response:

It's really messy...but you may try

library(igraph)
library(stringdist)
library(data.table)


table(df[1,])

d <- c()
for (i in 1:(ncol(df)-1)){
  for (j in (i 1):ncol(df)) {
    if(any(na.omit(stringdist(df[,i], df[,j], method = "lv") == 0))) {
      d <- rbind(d, c(i,j))
    }
  }
}
dd <- data.table(d)

net <- graph_from_data_frame(d = dd, directed = F)
key <- split(names(V(net)), components(net)$membership)
res <- matrix(NA,nrow = nrow(df), ncol = 0)
names_dummy <- c()
df_dummy <- c()
for (i in key){
  i <- as.numeric(i)
  names_dummy <- c(names_dummy, paste0(colnames(df)[i], collapse = "|"))
  df_dummy <- cbind(df_dummy, apply(df[,i], 1, function(x) {paste0(unique(unlist(x)), collapse = "|")}))
  
  
}
colnames(df_dummy) <- names_dummy
df_dummy



res <- cbind(df_dummy, df[,-as.numeric(unlist(key))])
res <- res[,sort(colnames(res))]
res

    COL1|COL2     COL3      COL4|COL5|COL6   COL7
1       Sp1-2 Sp3_2-54               Sp3-2 SP9-43
2 Sp5-1|Sp5-2    Sp2-4 Sp9-2|Sp10-3|SP9-90   <NA>
3      Sp_7-3     <NA>  SP6-56|Sp2-7|SP3-3   <NA>
  • Related