I have a dataframe such as :
COL1 COL2 COL3 COL4 COL5 COL6 COL7
1 Sp1-2 Sp1-2 Sp3_2-54 Sp3-2 Sp3-2 Sp3-2 SP9-43
2 Sp5-1 Sp5-2 Sp2-4 Sp9-2 Sp10-3 SP9-90 NA
3 Sp_7-3 Sp_7-3 NA SP6-56 Sp2-7 SP3-3 NA
And I would simply like to merge columns when at leats two elements are duplicated.
for example, in COL1
and COL2
, Sp1-2
& Sp_7-3
are duplicated in both columns, then I merge it that way by adding a pipe "|" between non-duplicated elements:
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
Here is the dput format :
structure(list(COL1 = c("Sp1-2", "Sp5-1", "Sp_7-3"), COL2 = c("Sp1-2",
"Sp5-2", "Sp_7-3"), COL3 = c("Sp3_2-54", "Sp2-4", NA), COL4 = c("Sp3-2",
"Sp9-2", "SP6-56"), COL5 = c("Sp3-2", "Sp10-3", "Sp2-7"), COL6 = c("Sp3-2",
"SP9-90", "SP3-3"), COL7 = c("SP9-43", NA, NA)), class = "data.frame", row.names = c(NA,
-3L))
Another example :
G136 G348 G465
1 NA NA NA
2 NA NA NA
3 SP4-140 SP4-140 NA
4 SP2-8 NA NA
5 SP3-59 NA NA
6 SP1_contig.682-8 NA SP1_contig.682-8
expected output:
G136|G348|G465
1 NA
2 NA
3 SP4-140
4 SP2-8
5 SP3-59
6 SP1_contig.682-8
the deput format :
dat<- structure(list(G136 = c(NA, NA, "SP4-140", "SP2-8", "SP3-59", "SP1_contig.682-8", NA, NA, NA), G348 = c(NA, NA, "SP4-140", NA, NA, NA, NA, NA, NA), G465 = c(NA, NA, NA, NA, NA, "SP1_contig.682-8", NA, NA, NA)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
CodePudding user response:
This is probably best handled by reshaping your data first, then it's straight forward to use various groupings to achieve your desired result:
library(tidyr)
library(dplyr)
dat %>%
rowid_to_column() %>%
pivot_longer(-rowid) %>%
filter(!is.na(value)) %>%
group_by(rowid, value) %>%
mutate(new_name = paste(name, collapse = "|")) %>%
separate_rows(new_name, sep = "\\|") %>%
group_by(name) %>%
mutate(new_name = paste(unique(new_name), collapse = "|")) %>%
group_by(value) %>%
filter(nchar(new_name) == max(nchar(new_name))) %>%
ungroup() %>%
select(-name) %>%
pivot_wider(names_from = new_name, values_from = value, values_fn = ~ paste(unique(.x), collapse = "|")) %>%
complete(rowid = full_seq(c(1, rowid), 1))
# A tibble: 3 × 5
rowid `COL1|COL2` COL3 `COL4|COL5|COL6` COL7
<dbl> <chr> <chr> <chr> <chr>
1 1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
And using the data in your second example gives:
# A tibble: 6 × 2
rowid `G136|G348|G465`
<dbl> <chr>
1 1 NA
2 2 NA
3 3 SP4-140
4 4 SP2-8
5 5 SP3-59
6 6 SP1_contig.682-8
CodePudding user response:
It's really messy...but you may try
library(igraph)
library(stringdist)
library(data.table)
table(df[1,])
d <- c()
for (i in 1:(ncol(df)-1)){
for (j in (i 1):ncol(df)) {
if(any(na.omit(stringdist(df[,i], df[,j], method = "lv") == 0))) {
d <- rbind(d, c(i,j))
}
}
}
dd <- data.table(d)
net <- graph_from_data_frame(d = dd, directed = F)
key <- split(names(V(net)), components(net)$membership)
res <- matrix(NA,nrow = nrow(df), ncol = 0)
names_dummy <- c()
df_dummy <- c()
for (i in key){
i <- as.numeric(i)
names_dummy <- c(names_dummy, paste0(colnames(df)[i], collapse = "|"))
df_dummy <- cbind(df_dummy, apply(df[,i], 1, function(x) {paste0(unique(unlist(x)), collapse = "|")}))
}
colnames(df_dummy) <- names_dummy
df_dummy
res <- cbind(df_dummy, df[,-as.numeric(unlist(key))])
res <- res[,sort(colnames(res))]
res
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 <NA>
3 Sp_7-3 <NA> SP6-56|Sp2-7|SP3-3 <NA>