I have a data frame with genes like this:
genes padj
ENSG00000106714 3.43056474068292E-05
ENSG00000165115 9.165798954677E-05
ENSG00000188056 0.000846081653952
ENSG00000254340 0.003678623922293
ENSG00000196083 0.007632605419062
ENSG00000236438 0.007632605419062
ENSG00000223401 0.009219856694741
ENSG00000233013 0.010086733081366
ENSG00000214077 0.009703019961401
And another one like this:
genes padj
ENSG00000165115 2.0309880524746E-07
ENSG00000106714 8.28506484840271E-07
ENSG00000111087 3.65536414358691E-06
ENSG00000257743 2.38040674140299E-05
ENSG00000261804 6.68330615861441E-05
ENSG00000251260 0.000868563126637
ENSG00000223401 0.000868563126637
ENSG00000089505 0.001233018119198
ENSG00000213934 0.001735780758384
I would like to create a new data frame containing the genes common to the two files, the genes unique to file 1, and the genes unique to file 2.
CodePudding user response:
You can use the dplyr
library to create a new data frame containing the common genes, each with its padj
value from both files.
Here's how to do it:
# dplyr provides inner_join().
library(dplyr)

# Keep only the genes present in BOTH inputs. Because each input has its own
# `padj` column, the result carries them side by side as padj.x (from genes1)
# and padj.y (from genes2).
df <- inner_join(
  x = genes1,
  y = genes2,
  by = "genes"
)
df
The output will look like this:
genes padj.x padj.y
1 ENSG00000106714 0.000034300 0.000000829
2 ENSG00000165115 0.000091700 0.000000203
3 ENSG00000223401 0.009219857 0.000868563
If you want all the genes
from both files to show up in your data frame along with their respective padj
values, you can use full_join
instead of inner_join:
# Keep every gene found in either input; padj.x / padj.y is NA for a gene that
# is missing from the corresponding input.
df <- full_join(
  x = genes1,
  y = genes2,
  by = "genes"
)
df
The output of full_join
looks like this:
genes padj.x padj.y
1 ENSG00000106714 0.000034300 0.000000829
2 ENSG00000165115 0.000091700 0.000000203
3 ENSG00000188056 0.000846082 NA
4 ENSG00000254340 0.003678624 NA
5 ENSG00000196083 0.007632605 NA
6 ENSG00000236438 0.007632605 NA
7 ENSG00000223401 0.009219857 0.000868563
8 ENSG00000233013 0.010086733 NA
9 ENSG00000214077 0.009703020 NA
10 ENSG00000111087 NA 0.000003660
11 ENSG00000257743 NA 0.000023800
12 ENSG00000261804 NA 0.000066800
13 ENSG00000251260 NA 0.000868563
14 ENSG00000089505 NA 0.001233018
15 ENSG00000213934 NA 0.001735781
CodePudding user response:
You can use merge()
and subset the result afterwards.
# Full outer join on `genes`: every gene from either data frame appears once,
# with padj.x taken from df_x and padj.y from df_y (NA where the gene is absent
# from that input). The outer parentheses print the result while assigning it.
(df_all <- merge(df_x, df_y, by = "genes", all = TRUE)) ## complete gene set
# genes padj.x padj.y
# 1 ENSG00000089505 NA 1.233018e-03
# 2 ENSG00000106714 3.430565e-05 8.285065e-07
# 3 ENSG00000111087 NA 3.655364e-06
# 4 ENSG00000165115 9.165799e-05 2.030988e-07
# 5 ENSG00000188056 8.460817e-04 NA
# 6 ENSG00000196083 7.632605e-03 NA
# 7 ENSG00000213934 NA 1.735781e-03
# 8 ENSG00000214077 9.703020e-03 NA
# 9 ENSG00000223401 9.219857e-03 8.685631e-04
# 10 ENSG00000233013 1.008673e-02 NA
# 11 ENSG00000236438 7.632605e-03 NA
# 12 ENSG00000251260 NA 8.685631e-04
# 13 ENSG00000254340 3.678624e-03 NA
# 14 ENSG00000257743 NA 2.380407e-05
# 15 ENSG00000261804 NA 6.683306e-05
# The subsets below classify rows by gene membership instead of by
# is.na(padj.*): if an input padj value is itself NA, an NA-based test would
# wrongly report that gene as missing from the input.
df_all[!df_all$genes %in% df_y$genes, ] ## genes unique to x
# genes padj.x padj.y
# 5 ENSG00000188056 0.0008460817 NA
# 6 ENSG00000196083 0.0076326054 NA
# 8 ENSG00000214077 0.0097030200 NA
# 10 ENSG00000233013 0.0100867331 NA
# 11 ENSG00000236438 0.0076326054 NA
# 13 ENSG00000254340 0.0036786239 NA
df_all[!df_all$genes %in% df_x$genes, ] ## genes unique to y
# genes padj.x padj.y
# 1 ENSG00000089505 NA 1.233018e-03
# 3 ENSG00000111087 NA 3.655364e-06
# 7 ENSG00000213934 NA 1.735781e-03
# 12 ENSG00000251260 NA 8.685631e-04
# 14 ENSG00000257743 NA 2.380407e-05
# 15 ENSG00000261804 NA 6.683306e-05
df_all[df_all$genes %in% df_x$genes & df_all$genes %in% df_y$genes, ] ## common set
# genes padj.x padj.y
# 2 ENSG00000106714 3.430565e-05 8.285065e-07
# 4 ENSG00000165115 9.165799e-05 2.030988e-07
# 9 ENSG00000223401 9.219857e-03 8.685631e-04
Data:
# First example input: nine gene IDs (character) with their padj values.
df_x <- data.frame(
  genes = c(
    "ENSG00000106714", "ENSG00000165115", "ENSG00000188056",
    "ENSG00000254340", "ENSG00000196083", "ENSG00000236438",
    "ENSG00000223401", "ENSG00000233013", "ENSG00000214077"
  ),
  padj = c(
    3.43056474068292e-05, 9.165798954677e-05, 0.000846081653952,
    0.003678623922293, 0.007632605419062, 0.007632605419062,
    0.009219856694741, 0.010086733081366, 0.009703019961401
  ),
  # Keep `genes` as character even on R versions where factors are the default.
  stringsAsFactors = FALSE
)
# Second example input: nine gene IDs (character) with their padj values.
df_y <- data.frame(
  genes = c(
    "ENSG00000165115", "ENSG00000106714", "ENSG00000111087",
    "ENSG00000257743", "ENSG00000261804", "ENSG00000251260",
    "ENSG00000223401", "ENSG00000089505", "ENSG00000213934"
  ),
  padj = c(
    2.0309880524746e-07, 8.28506484840271e-07, 3.65536414358691e-06,
    2.38040674140299e-05, 6.68330615861441e-05, 0.000868563126637,
    0.000868563126637, 0.001233018119198, 0.001735780758384
  ),
  # Keep `genes` as character even on R versions where factors are the default.
  stringsAsFactors = FALSE
)