Objective To see the common elements that is my row which are basically gene name in my different comparisons.
This was the answer which I tried to follow.
df1 = data.frame(genes = c('gene1', 'gene3', 'gene4', 'gene2'))
df2 = data.frame(genes = c('gene3', 'gene2', 'gene5', 'gene1', "genet"))
df3 = data.frame(genes = c('gene6', 'gene3', 'gene4', 'gdene7', 'genex', "gene10"))
dfList <- list(df1, df2, df3)
reduce(dfList, inner_join)
reduce(dfList, inner_join)
Joining, by = "genes"
Joining, by = "genes"
genes
1 gene3
This fails in this case
df1 = data.frame(genes = c('gene1', 'gene3', 'gene4', 'gene2'))
df2 = data.frame(genes = c('gene3', 'gene2', 'gene5', 'gene1', "genet"))
df3 = data.frame(genes = c('gene6', 'gene13', 'gene4', 'gdene7', 'genex', "gene10"))
dfList <- list(df1, df2, df3)
reduce(dfList, inner_join)
educe(dfList, inner_join)
Joining, by = "genes"
Joining, by = "genes"
[1] genes
<0 rows> (or 0-length row.names)
Now how to address this problem. I gave a small set I have like 15 comparison.
Expected output
gene3 df1 df2 df3 ## for common genes
gene1 df1 df2 ## for genes which arr not across all the combination
gene2
In the first case the solution works as the gene3 is preset in all the case but fails when it is present in only 2 condition.
So how do I find out all the possible combination where the genes are present in different possible combination.
For example if gene3 is present in all three so it is reported but gene1 and gene2 are present in df1
and df2
but these are not reported.
So I would like to see if a group of genes present in all condition which is not possible most likely but all the possible combination where its is present
My actual dataframes are named as such which is in a list
names(result_abd)
[1] "M0_vs_M1_TCGA_stages" "M0_vs_M2_TCGA_stages" "M0_vs_M3_TCGA_stages" "M0_vs_M4_TCGA_stages" "M0_vs_M5_TCGA_stages" "M1_vs_M2_TCGA_stages"
[7] "M1_vs_M3_TCGA_stages" "M1_vs_M4_TCGA_stages" "M1_vs_M5_TCGA_stages" "M2_vs_M3_TCGA_stages" "M2_vs_M4_TCGA_stages" "M2_vs_M5_TCGA_stages"
[13] "M3_vs_M4_TCGA_stages" "M3_vs_M5_TCGA_stages" "M4_vs_M5_TCGA_stages"
>
So I would have like 15 columns for each dataframe
I ran your code the output is as such
dput(head(a))
structure(list(gene = c("ENSG00000000003", "ENSG00000000971",
"ENSG00000002726", "ENSG00000003989", "ENSG00000005381", "ENSG00000006534"
), dfM0_vs_M1_TCGA_stages = c("M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages",
"M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages", "M0_vs_M1_TCGA_stages",
"M0_vs_M1_TCGA_stages"), dfM0_vs_M2_TCGA_stages = c(NA, "M0_vs_M2_TCGA_stages",
"M0_vs_M2_TCGA_stages", NA, "M0_vs_M2_TCGA_stages", NA), dfM0_vs_M3_TCGA_stages = c("M0_vs_M3_TCGA_stages",
"M0_vs_M3_TCGA_stages", "M0_vs_M3_TCGA_stages", NA, "M0_vs_M3_TCGA_stages",
NA), dfM0_vs_M4_TCGA_stages = c("M0_vs_M4_TCGA_stages", NA, "M0_vs_M4_TCGA_stages",
NA, "M0_vs_M4_TCGA_stages", "M0_vs_M4_TCGA_stages"), dfM0_vs_M5_TCGA_stages = c("M0_vs_M5_TCGA_stages",
NA, "M0_vs_M5_TCGA_stages", NA, "M0_vs_M5_TCGA_stages", "M0_vs_M5_TCGA_stages"
), dfM1_vs_M2_TCGA_stages = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), dfM1_vs_M3_TCGA_stages = c(NA,
NA, NA, NA, "M1_vs_M3_TCGA_stages", NA), dfM1_vs_M4_TCGA_stages = c(NA,
"M1_vs_M4_TCGA_stages", NA, NA, NA, NA), dfM1_vs_M5_TCGA_stages = c(NA,
NA, "M1_vs_M5_TCGA_stages", NA, NA, NA), dfM2_vs_M3_TCGA_stages = c(NA,
NA, NA, NA, "M2_vs_M3_TCGA_stages", NA), dfM2_vs_M4_TCGA_stages = c(NA,
"M2_vs_M4_TCGA_stages", NA, NA, NA, NA), dfM2_vs_M5_TCGA_stages = c(NA,
NA, "M2_vs_M5_TCGA_stages", NA, "M2_vs_M5_TCGA_stages", NA),
dfM3_vs_M4_TCGA_stages = c(NA, "M3_vs_M4_TCGA_stages", NA,
NA, "M3_vs_M4_TCGA_stages", NA), dfM3_vs_M5_TCGA_stages = c(NA,
"M3_vs_M5_TCGA_stages", NA, NA, "M3_vs_M5_TCGA_stages", NA
), dfM4_vs_M5_TCGA_stages = c(NA, NA, "M4_vs_M5_TCGA_stages",
NA, NA, NA)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Dataframe format
A tibble: 6 × 16
gene dfM0_vs_M1_TCGA… dfM0_vs_M2_TCGA… dfM0_vs_M3_TCGA… dfM0_vs_M4_TCGA… dfM0_vs_M5_TCGA… dfM1_vs_M2_TCGA… dfM1_vs_M3_TCGA… dfM1_vs_M4_TCGA…
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ENSG00000000003 M0_vs_M1_TCGA_s… NA M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
2 ENSG00000000971 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… NA NA NA NA M1_vs_M4_TCGA_s…
3 ENSG00000002726 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
4 ENSG00000003989 M0_vs_M1_TCGA_s… NA NA NA NA NA NA NA
5 ENSG00000005381 M0_vs_M1_TCGA_s… M0_vs_M2_TCGA_s… M0_vs_M3_TCGA_s… M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA M1_vs_M3_TCGA_s… NA
6 ENSG00000006534 M0_vs_M1_TCGA_s… NA NA M0_vs_M4_TCGA_s… M0_vs_M5_TCGA_s… NA NA NA
Now this is what i wanted. The next step as an example I would like to see
If I take this gene ENSG00000000971
is present in 7 comparison but not in others where its is reported as NA
.How do I group them.
Like making another data frame with those genes lets say are present in multiple comparison and not include wherever this is NA
CodePudding user response:
It's not clear to me exactly how you want your ourput formatted (several index columns or one column containing a string or a list column). But here's one option. I start by combining your list of data fames into a single data frame, with an index indicating the source.
library(tidyverse)
dfList <- list(df1, df2, df3)
dfList %>%
bind_rows(.id="df") %>%
pivot_wider(names_from=df, names_prefix="df", values_from=df)
# A tibble: 10 × 4
genes df1 df2 df3
<chr> <chr> <chr> <chr>
1 gene1 1 2 NA
2 gene3 1 2 3
3 gene4 1 NA 3
4 gene2 1 2 NA
5 gene5 NA 2 NA
6 genet NA 2 NA
7 gene6 NA NA 3
8 gdene7 NA NA 3
9 genex NA NA 3
10 gene10 NA NA 3
Addition in response to OP's question below. (Though note that's actually a new question and really should be a new post.)
dfList %>%
bind_rows(.id="df") %>%
group_by(genes) %>%
summarise(minDF=min(df), maxDF=max(df)) %>%
filter(minDF == maxDF & maxDF == 3) %>%
pull(genes)
[1] "gdene7" "gene10" "gene6" "genex"
Once again, the key is to put all the data into a single data frame. (And the desired format of the output is not clear.)