I have the following data frames:
df1
> head(df1)
data.source data.term_name data.term_id data.adjusted_p_value
1 GO:MF catalytic activity GO:0003824 4.940656e-324 323.3062
2 GO:MF molecular_function GO:0003674 4.940656e-324 323.3062
3 GO:MF binding GO:0005488 4.940656e-324 323.3062
4 GO:MF ion binding GO:0043167 4.940656e-324 323.3062
5 GO:MF organic cyclic compound binding GO:0097159 9.573609e-319 318.0189
6 GO:MF heterocyclic compound binding GO:1901363
df2
> head(filter_tig2)
# A tibble: 6 × 5
# Groups: g0.seq_id, g0.product [6]
g0.seq_id g0.product g0.ontology_term name_count process
<chr> <chr> <chr> <int> <lgl>
1 000000F 3-hydroxy-3-methylglutaryl coenzyme A synthase GO:0003824,GO:0008152,GO:0004421,GO:0003824 1 NA
2 000000F 3-isopropylmalate dehydratase GO:0055114,GO:0009098,GO:0016616,GO:0003862,GO:000… 1 NA
3 000000F 40S ribosomal protein S17 GO:0006412,GO:0003735,GO:0005622,GO:0005840 1 NA
4 000000F 40S ribosomal protein S20 GO:0006412,GO:0003735,GO:0003723,GO:0005840,GO:001… 1 NA
5 000000F 40S ribosomal protein S26 GO:0006412,GO:0003735,GO:0005622,GO:0005840 1 NA
6 000000F 40S ribosomal protein S29 GO:0003735,GO:0006412,GO:0005840,GO:0005622 1 NA
I need to create a new column in df2 with the data.source values based on *data.term_id from df1
df1
> dput(head(df1))
structure(list(data.source = c("GO:MF", "GO:MF", "GO:MF", "GO:MF",
"GO:MF", "GO:MF"), data.term_name = c("catalytic activity", "molecular_function",
"binding", "ion binding", "organic cyclic compound binding",
"heterocyclic compound binding"), data.term_id = c("GO:0003824",
"GO:0003674", "GO:0005488", "GO:0043167", "GO:0097159", "GO:1901363"
), data.adjusted_p_value = c(4.94065645841247e-324, 4.94065645841247e-324,
4.94065645841247e-324, 4.94065645841247e-324, 9.573608832595e-319,
4.68422156625148e-318), data.negative_log10_of_adjusted_p_value = c(323.306215343116,
323.306215343116, 323.306215343116, 323.306215343116, 318.018924321299,
317.329362570746), data.term_size = c(3786L, 6157L, 3549L, 2192L,
2116L, 2112L), data.query_size = c(7587L, 7587L, 7587L, 7587L,
7587L, 7587L), data.intersection_size = c(3786L, 6153L, 3549L,
2192L, 2116L, 2112L), data.effective_domain_size = c(10286L,
10286L, 10286L, 10286L, 10286L, 10286L)), row.names = c(NA, 6L
), class = "data.frame")
df2
> dput(head(df2))
structure(list(g0.seq_id = c("000000F", "000000F", "000000F",
"000000F", "000000F", "000000F"), g0.product = c("3-hydroxy-3-methylglutaryl coenzyme A synthase",
"3-isopropylmalate dehydratase", "40S ribosomal protein S17",
"40S ribosomal protein S20", "40S ribosomal protein S26", "40S ribosomal protein S29"
), g0.ontology_term = c("GO:0008299,GO:0008152,GO:0004421,GO:0003824",
"GO:0055114,GO:0009098,GO:0016616,GO:0003862,GO:0000287,GO:0051287,GO:0005737",
"GO:0006412,GO:0003735,GO:0005622,GO:0005840", "GO:0006412,GO:0003735,GO:0003723,GO:0005840,GO:0015935,GO:0005622",
"GO:0006412,GO:0003735,GO:0005622,GO:0005840", "GO:0003735,GO:0006412,GO:0005840,GO:0005622"
), name_count = c(1L, 1L, 1L, 1L, 1L, 1L), process = c(NA, NA,
NA, NA, NA, NA)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), groups = structure(list(g0.seq_id = c("000000F",
"000000F", "000000F", "000000F", "000000F", "000000F"), g0.product = c("3-hydroxy-3-methylglutaryl coenzyme A synthase",
"3-isopropylmalate dehydratase", "40S ribosomal protein S17",
"40S ribosomal protein S20", "40S ribosomal protein S26", "40S ribosomal protein S29"
), .rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), .drop = TRUE))
So I need to search if the value of df2$g0.ontology_term are in df1$data.term_id, if yes put the data.source correspondent in the new column.
For example:
I have the GO:0003824 in df2$g0.ontology_term , so my output will be:
data.source data.term_name data.term_id data.adjusted_p_value new_column
1 GO:MF catalytic activity GO:0003824 4.940656e-324 GO:MF
I tried with dplyr but I can`t do that.
Anyone could help me?
CodePudding user response:
I think this is what you want to do, i.e. add a data.source
to df2 for the one row that matches? (Your example output is augmenting df1, so I've aligned to the words.)
library(tidyverse)
df1 <- tribble(
~data.source, ~data.term_name, ~data.term_id, ~data.adjusted_p_value,
"GO:MF", "catalytic activity", "GO:0003824", "4.940656e-324",
"GO:MF", "molecular_function", "GO:0003674", "4.940656e-324",
"GO:MF", "binding", "GO:0005488", "4.940656e-324",
"GO:MF", "ion binding", "GO:0043167", "4.940656e-324",
"GO:MF", "organic cyclic compound binding", "GO:0097159", "9.573609e-319",
"GO:MF", "heterocyclic compound binding", "GO:1901363", NA
)
df2 <- tribble(
~g0.seq_id, ~g0.product, ~g0.ontology_term, ~name_count, ~process,
"000000F", "3-hydroxy-3-methylglutaryl coenzyme A synthase", "GO:0003824 GO:0008152,GO:0004421,GO:0003824", 1, NA,
"000000F", "3-isopropylmalate dehydratase", "GO:0055114 GO:0009098,GO:0016616,GO:0003862,GO:000…", 1, NA,
"000000F", "40S ribosomal protein S17", "GO:0006412 GO:0003735,GO:0005622,GO:0005840", 1, NA,
"000000F", "40S ribosomal protein S20", "GO:0006412 GO:0003735,GO:0003723,GO:0005840,GO:001…", 1, NA,
"000000F", "40S ribosomal protein S26", "GO:0006412 GO:0003735,GO:0005622,GO:0005840", 1, NA,
"000000F", "40S ribosomal protein S29", "GO:0003735 GO:0006412,GO:0005840,GO:0005622", 1, NA
)
df3 <- df2 |>
separate(g0.ontology_term, into = c("data.term_id", "g0.ontology_term part2"), sep = " ") |>
inner_join(df1 |> select(data.source, data.term_id), by = "data.term_id")
df3
#> # A tibble: 1 × 7
#> g0.seq_id g0.product data.term_id `g0.ontology_t…` name_count process
#> <chr> <chr> <chr> <chr> <dbl> <lgl>
#> 1 000000F 3-hydroxy-3-methyl… GO:0003824 GO:0008152,GO:0… 1 NA
#> # … with 1 more variable: data.source <chr>
Created on 2022-05-18 by the reprex package (v2.0.1)