Compare two columns of distinct data frames and create another column in R-CodePudding

I have the following data frames:

df1

> head(df1)
  data.source                  data.term_name data.term_id data.adjusted_p_value 
1       GO:MF              catalytic activity   GO:0003824         4.940656e-324                                323.3062
2       GO:MF              molecular_function   GO:0003674         4.940656e-324                                323.3062
3       GO:MF                         binding   GO:0005488         4.940656e-324                                323.3062
4       GO:MF                     ion binding   GO:0043167         4.940656e-324                                323.3062
5       GO:MF organic cyclic compound binding   GO:0097159         9.573609e-319                                318.0189
6       GO:MF   heterocyclic compound binding   GO:1901363

df2

> head(filter_tig2)
# A tibble: 6 × 5
# Groups:   g0.seq_id, g0.product [6]
  g0.seq_id g0.product                                     g0.ontology_term                                    name_count process
  <chr>     <chr>                                          <chr>                                                    <int> <lgl>  
1 000000F   3-hydroxy-3-methylglutaryl coenzyme A synthase GO:0003824,GO:0008152,GO:0004421,GO:0003824                  1 NA     
2 000000F   3-isopropylmalate dehydratase                  GO:0055114,GO:0009098,GO:0016616,GO:0003862,GO:000…          1 NA     
3 000000F   40S ribosomal protein S17                      GO:0006412,GO:0003735,GO:0005622,GO:0005840                  1 NA     
4 000000F   40S ribosomal protein S20                      GO:0006412,GO:0003735,GO:0003723,GO:0005840,GO:001…          1 NA     
5 000000F   40S ribosomal protein S26                      GO:0006412,GO:0003735,GO:0005622,GO:0005840                  1 NA     
6 000000F   40S ribosomal protein S29                      GO:0003735,GO:0006412,GO:0005840,GO:0005622                  1 NA

I need to create a new column in df2 with the data.source values based on *data.term_id from df1

df1

> dput(head(df1))
structure(list(data.source = c("GO:MF", "GO:MF", "GO:MF", "GO:MF", 
"GO:MF", "GO:MF"), data.term_name = c("catalytic activity", "molecular_function", 
"binding", "ion binding", "organic cyclic compound binding", 
"heterocyclic compound binding"), data.term_id = c("GO:0003824", 
"GO:0003674", "GO:0005488", "GO:0043167", "GO:0097159", "GO:1901363"
), data.adjusted_p_value = c(4.94065645841247e-324, 4.94065645841247e-324, 
4.94065645841247e-324, 4.94065645841247e-324, 9.573608832595e-319, 
4.68422156625148e-318), data.negative_log10_of_adjusted_p_value = c(323.306215343116, 
323.306215343116, 323.306215343116, 323.306215343116, 318.018924321299, 
317.329362570746), data.term_size = c(3786L, 6157L, 3549L, 2192L, 
2116L, 2112L), data.query_size = c(7587L, 7587L, 7587L, 7587L, 
7587L, 7587L), data.intersection_size = c(3786L, 6153L, 3549L, 
2192L, 2116L, 2112L), data.effective_domain_size = c(10286L, 
10286L, 10286L, 10286L, 10286L, 10286L)), row.names = c(NA, 6L
), class = "data.frame")

df2

> dput(head(df2))
structure(list(g0.seq_id = c("000000F", "000000F", "000000F", 
"000000F", "000000F", "000000F"), g0.product = c("3-hydroxy-3-methylglutaryl coenzyme A synthase", 
"3-isopropylmalate dehydratase", "40S ribosomal protein S17", 
"40S ribosomal protein S20", "40S ribosomal protein S26", "40S ribosomal protein S29"
), g0.ontology_term = c("GO:0008299,GO:0008152,GO:0004421,GO:0003824", 
"GO:0055114,GO:0009098,GO:0016616,GO:0003862,GO:0000287,GO:0051287,GO:0005737", 
"GO:0006412,GO:0003735,GO:0005622,GO:0005840", "GO:0006412,GO:0003735,GO:0003723,GO:0005840,GO:0015935,GO:0005622", 
"GO:0006412,GO:0003735,GO:0005622,GO:0005840", "GO:0003735,GO:0006412,GO:0005840,GO:0005622"
), name_count = c(1L, 1L, 1L, 1L, 1L, 1L), process = c(NA, NA, 
NA, NA, NA, NA)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), groups = structure(list(g0.seq_id = c("000000F", 
"000000F", "000000F", "000000F", "000000F", "000000F"), g0.product = c("3-hydroxy-3-methylglutaryl coenzyme A synthase", 
"3-isopropylmalate dehydratase", "40S ribosomal protein S17", 
"40S ribosomal protein S20", "40S ribosomal protein S26", "40S ribosomal protein S29"
), .rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), .drop = TRUE))

So I need to search if the value of df2$g0.ontology_term are in df1$data.term_id, if yes put the data.source correspondent in the new column.

For example:

I have the GO:0003824 in df2$g0.ontology_term , so my output will be:

 data.source                  data.term_name data.term_id data.adjusted_p_value new_column
1       GO:MF              catalytic activity   GO:0003824         4.940656e-324  GO:MF

I tried with dplyr but I can`t do that.

Anyone could help me?

CodePudding user response：

I think this is what you want to do, i.e. add a data.source to df2 for the one row that matches? (Your example output is augmenting df1, so I've aligned to the words.)

library(tidyverse)

df1 <- tribble(
  ~data.source, ~data.term_name, ~data.term_id, ~data.adjusted_p_value,
  "GO:MF", "catalytic activity", "GO:0003824", "4.940656e-324",
  "GO:MF", "molecular_function", "GO:0003674", "4.940656e-324",
  "GO:MF", "binding", "GO:0005488", "4.940656e-324",
  "GO:MF", "ion binding", "GO:0043167", "4.940656e-324",
  "GO:MF", "organic cyclic compound binding", "GO:0097159", "9.573609e-319",
  "GO:MF", "heterocyclic compound binding", "GO:1901363", NA
)

df2 <- tribble(
  ~g0.seq_id, ~g0.product, ~g0.ontology_term, ~name_count, ~process,
  "000000F", "3-hydroxy-3-methylglutaryl coenzyme A synthase", "GO:0003824 GO:0008152,GO:0004421,GO:0003824", 1, NA,
  "000000F", "3-isopropylmalate dehydratase", "GO:0055114 GO:0009098,GO:0016616,GO:0003862,GO:000…", 1, NA,
  "000000F", "40S ribosomal protein S17", "GO:0006412 GO:0003735,GO:0005622,GO:0005840", 1, NA,
  "000000F", "40S ribosomal protein S20", "GO:0006412 GO:0003735,GO:0003723,GO:0005840,GO:001…", 1, NA,
  "000000F", "40S ribosomal protein S26", "GO:0006412 GO:0003735,GO:0005622,GO:0005840", 1, NA,
  "000000F", "40S ribosomal protein S29", "GO:0003735 GO:0006412,GO:0005840,GO:0005622", 1, NA
)

df3 <- df2 |> 
  separate(g0.ontology_term, into = c("data.term_id", "g0.ontology_term part2"), sep = " ") |> 
  inner_join(df1 |> select(data.source, data.term_id), by = "data.term_id")

df3
#> # A tibble: 1 × 7
#>   g0.seq_id g0.product          data.term_id `g0.ontology_t…` name_count process
#>   <chr>     <chr>               <chr>        <chr>                 <dbl> <lgl>  
#> 1 000000F   3-hydroxy-3-methyl… GO:0003824   GO:0008152,GO:0…          1 NA     
#> # … with 1 more variable: data.source <chr>

^{Created on 2022-05-18 by the reprex package (v2.0.1)}