Create a new file with common and unique genes from two data frames in R-CodePudding

I have a data frame with genes like this:

genes   padj
ENSG00000106714 3.43056474068292E-05
ENSG00000165115 9.165798954677E-05
ENSG00000188056 0.000846081653952
ENSG00000254340 0.003678623922293
ENSG00000196083 0.007632605419062
ENSG00000236438 0.007632605419062
ENSG00000223401 0.009219856694741
ENSG00000233013 0.010086733081366
ENSG00000214077 0.009703019961401

And another one like this:

genes   padj
ENSG00000165115 2.0309880524746E-07
ENSG00000106714 8.28506484840271E-07
ENSG00000111087 3.65536414358691E-06
ENSG00000257743 2.38040674140299E-05
ENSG00000261804 6.68330615861441E-05
ENSG00000251260 0.000868563126637
ENSG00000223401 0.000868563126637
ENSG00000089505 0.001233018119198
ENSG00000213934 0.001735780758384

I would like to create a new data frame containing the genes common to the two files, the genes unique to file 1, and the genes unique to file 2.

CodePudding user response：

You can use the dplyr library to create a new data frame containing the genes common to both tables, with the padj value from each table in its own column.

Here's how to do it:

library(dplyr)
# Inner join on the gene ID: only genes present in both tables are kept.
# The two padj columns come back as padj.x (from genes1) and padj.y
# (from genes2).
df <- inner_join(x = genes1, y = genes2, by = "genes")
print(df)

The output will look like this:

           genes      padj.x      padj.y
1 ENSG00000106714 0.000034300 0.000000829
2 ENSG00000165115 0.000091700 0.000000203
3 ENSG00000223401 0.009219857 0.000868563

If you want all the genes to show up in your data frame along with their respective padj values, you can use full_join instead of inner_join. Genes that occur in only one table get NA in the other table's padj column.

# Full join on the gene ID: every gene from either table gets a row, and a
# gene missing from one table has NA in that table's padj column.
df <- full_join(x = genes1, y = genes2, by = "genes")
print(df)

The output of full_join looks like this:

             genes      padj.x      padj.y
1  ENSG00000106714 0.000034300 0.000000829
2  ENSG00000165115 0.000091700 0.000000203
3  ENSG00000188056 0.000846082          NA
4  ENSG00000254340 0.003678624          NA
5  ENSG00000196083 0.007632605          NA
6  ENSG00000236438 0.007632605          NA
7  ENSG00000223401 0.009219857 0.000868563
8  ENSG00000233013 0.010086733          NA
9  ENSG00000214077 0.009703020          NA
10 ENSG00000111087          NA 0.000003660
11 ENSG00000257743          NA 0.000023800
12 ENSG00000261804          NA 0.000066800
13 ENSG00000251260          NA 0.000868563
14 ENSG00000089505          NA 0.001233018
15 ENSG00000213934          NA 0.001735781

CodePudding user response：

Use merge() to build the complete table, then subset it afterwards.

# Full outer join on the gene ID (all = TRUE): every gene from either table
# gets one row, with NA in the padj column of the table it is missing from.
df_all <- merge(x = df_x, y = df_y, by = "genes", all = TRUE)
print(df_all)  ## complete gene set
#              genes       padj.x       padj.y
# 1  ENSG00000089505           NA 1.233018e-03
# 2  ENSG00000106714 3.430565e-05 8.285065e-07
# 3  ENSG00000111087           NA 3.655364e-06
# 4  ENSG00000165115 9.165799e-05 2.030988e-07
# 5  ENSG00000188056 8.460817e-04           NA
# 6  ENSG00000196083 7.632605e-03           NA
# 7  ENSG00000213934           NA 1.735781e-03
# 8  ENSG00000214077 9.703020e-03           NA
# 9  ENSG00000223401 9.219857e-03 8.685631e-04
# 10 ENSG00000233013 1.008673e-02           NA
# 11 ENSG00000236438 7.632605e-03           NA
# 12 ENSG00000251260           NA 8.685631e-04
# 13 ENSG00000254340 3.678624e-03           NA
# 14 ENSG00000257743           NA 2.380407e-05
# 15 ENSG00000261804           NA 6.683306e-05

# Genes found only in df_x. Membership is tested on the gene IDs rather than
# on is.na(padj.y), so a gene present in both tables whose padj in df_y is
# itself NA is not misreported as unique to df_x.
df_all[!df_all$genes %in% df_y$genes, ]  ## genes unique to x
#              genes       padj.x padj.y
# 5  ENSG00000188056 0.0008460817     NA
# 6  ENSG00000196083 0.0076326054     NA
# 8  ENSG00000214077 0.0097030200     NA
# 10 ENSG00000233013 0.0100867331     NA
# 11 ENSG00000236438 0.0076326054     NA
# 13 ENSG00000254340 0.0036786239     NA

# Genes found only in df_y. Membership is tested on the gene IDs rather than
# on is.na(padj.x), so a gene present in both tables whose padj in df_x is
# itself NA is not misreported as unique to df_y.
df_all[!df_all$genes %in% df_x$genes, ]  ## genes unique to y
#              genes padj.x       padj.y
# 1  ENSG00000089505     NA 1.233018e-03
# 3  ENSG00000111087     NA 3.655364e-06
# 7  ENSG00000213934     NA 1.735781e-03
# 12 ENSG00000251260     NA 8.685631e-04
# 14 ENSG00000257743     NA 2.380407e-05
# 15 ENSG00000261804     NA 6.683306e-05

# Genes present in both tables. The gene IDs are intersected directly instead
# of relying on complete.cases(), which would silently drop a shared gene
# whose padj is NA in either input table.
df_all[df_all$genes %in% intersect(df_x$genes, df_y$genes), ]  ## common set
#             genes       padj.x       padj.y
# 2 ENSG00000106714 3.430565e-05 8.285065e-07
# 4 ENSG00000165115 9.165799e-05 2.030988e-07
# 9 ENSG00000223401 9.219857e-03 8.685631e-04

Data:

# Example table 1 (the first data frame shown in the question): nine gene IDs
# with their padj values. stringsAsFactors = FALSE keeps `genes` as character.
df_x <- data.frame(
  genes = c(
    "ENSG00000106714", "ENSG00000165115", "ENSG00000188056",
    "ENSG00000254340", "ENSG00000196083", "ENSG00000236438",
    "ENSG00000223401", "ENSG00000233013", "ENSG00000214077"
  ),
  padj = c(
    3.43056474068292e-05, 9.165798954677e-05, 0.000846081653952,
    0.003678623922293, 0.007632605419062, 0.007632605419062,
    0.009219856694741, 0.010086733081366, 0.009703019961401
  ),
  stringsAsFactors = FALSE
)

# Example table 2 (the second data frame shown in the question): nine gene IDs
# with their padj values. stringsAsFactors = FALSE keeps `genes` as character.
df_y <- data.frame(
  genes = c(
    "ENSG00000165115", "ENSG00000106714", "ENSG00000111087",
    "ENSG00000257743", "ENSG00000261804", "ENSG00000251260",
    "ENSG00000223401", "ENSG00000089505", "ENSG00000213934"
  ),
  padj = c(
    2.0309880524746e-07, 8.28506484840271e-07, 3.65536414358691e-06,
    2.38040674140299e-05, 6.68330615861441e-05, 0.000868563126637,
    0.000868563126637, 0.001233018119198, 0.001735780758384
  ),
  stringsAsFactors = FALSE
)