I am trying to scrape the list of compound synonyms from this website: https://pubchem.ncbi.nlm.nih.gov/compound/440917#section=Depositor-Supplied-Synonyms&fullscreen=true
My current code looks like this:
# Download and parse the compound page once; the helper below works on the parsed document.
dl <- read_html("https://pubchem.ncbi.nlm.nih.gov/compound/440917#section=Depositor-Supplied-Synonyms&fullscreen=true")
# Return the text of every node in `x` that matches the ".section-content-item" selector.
get_synonyms <- function(x) {
  matched_nodes <- html_nodes(x, ".section-content-item")
  html_text(matched_nodes)
}
get_synonyms(dl)
I want to be able to do this for multiple compounds from PubChem so I am using a function. I am unsure what to put in the html_nodes() function based on the website's structure. The following did not work:
section-content-item
, section-content
, Depositor-Supplied-Synonyms
all of which seem to be classes holding the table of synonyms.
Thank you for any help
CodePudding user response:
The text is rendered by JavaScript, so it is easier to request the data from the API as JSON and parse that. To scrape the rendered page itself you would need something like Selenium. I tried with citric acid (311), and the same procedure works by substituting 440917 for 311 in the URL. Let me know if this works.
Additionally, I have added a procedure for a column of multiple chemicals.
library(tidyverse)
library(jsonlite)
# Fetch the "Depositor Supplied Synonyms" section for citric acid (CID 311)
# from the PubChem PUG View REST API. The spaces in the heading are written
# as "+" so that the query string is a valid URL.
data <- jsonlite::fromJSON("https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/311/JSON/?heading=Depositor+Supplied+Synonyms")
# Index down through the nested sections to the first Information entry.
data$Record$Section$Section[[1]]$Section[[1]]$Information[[1]][[2]][[1, 1]]
# For multiple chemicals: one request per ID, with each result stored in a
# list-column so every row can hold a different number of entries.
df <- as_tibble_col(c(311, 440917, 5280450, 16129778, 1175), "IDs") %>%
  rowwise() %>%
  mutate(synonyms = list(jsonlite::fromJSON(paste0(
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/", IDs,
    "/JSON/?heading=Depositor+Supplied+Synonyms"
  ))$Record$Section$Section[[1]]$Section[[1]]$Information[[1]][[2]][[1, 1]])) %>%
  ungroup()
CodePudding user response:
The following function will return all the synonyms for a given compound from the site, as long as you know the ID number:
library(httr)
library(dplyr)
#' Fetch the depositor-supplied synonyms of a PubChem compound.
#'
#' @param compound PubChem compound ID (CID) inserted into the request URL.
#' @return A character vector holding the flattened contents of the section's
#'   `Information` element, with names dropped.
get_synonyms <- function(compound) {
  # "+" encodes the spaces in the heading so the query string is a valid URL.
  url <- paste0(
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/",
    compound, "/JSON/?heading=Depositor+Supplied+Synonyms"
  )
  response <- GET(url)
  # Surface HTTP errors explicitly rather than letting the indexing below
  # fail on an error body.
  stop_for_status(response)
  parsed <- content(response, "parsed")
  info <- parsed$Record$Section[[1]]$Section[[1]]$Section[[1]]$Information
  as.vector(unlist(info))
}
For example, with your limonene case:
# Example: depositor-supplied synonyms for D-limonene (CID 440917).
# NOTE(review): element [1] ("84") looks like a reference number flattened in
# by unlist() rather than a synonym — confirm against the raw JSON.
get_synonyms(440917)
#> [1] "84"
#> [2] "D-Limonene"
#> [3] "5989-27-5"
#> [4] "(R)-(+)-Limonene"
#> [5] "(+)-Limonene"
#> [6] "(D)-Limonene"
#> [7] "(+)-(4R)-Limonene"
#> [8] "(+)-carvene"
#> [9] "(4R)-Limonene"
#> [10] "D-(+)-Limonene"
#> [11] "D-Limonen"
#> [12] "(R)-Limonene"
#> [13] "(R)-p-Mentha-1,8-diene"
#> [14] "Citrene"
#> [15] "(+)-p-Mentha-1,8-diene"
#> [16] "(R)-4-Isopropenyl-1-methyl-1-cyclohexene"
#> [17] "Limonene, D-"
#> [18] "(+)-R-Limonene"
#> [19] "Cyclohexene, 1-methyl-4-(1-methylethenyl)-, (4R)-"
#> [20] "d-p-Mentha-1,8-diene"
#> [21] "(+)-4-Isopropenyl-1-methylcyclohexene"
#> [22] "(4R)-4-isopropenyl-1-methylcyclohexene"
#> [23] "(R)-(+)-p-Mentha-1,8-diene"
#> [24] "FEMA No. 2633"
#> [25] "(R)-1-Methyl-4-(1-methylethenyl)cyclohexene"
#> [26] "(+)-(R)-Limonene"
#> [27] "UNII-GFD7C86Q1W"
#> [28] "r-(+)-limonene"
#> [29] "(R)-1-Methyl-4-(prop-1-en-2-yl)cyclohex-1-ene"
#> [30] "(4R)-1-methyl-4-prop-1-en-2-ylcyclohexene"
#> [31] "MFCD00062991"
#> [32] "GFD7C86Q1W"
#> [33] "4betaH-p-mentha-1,8-diene"
#> [34] "CHEBI:15382"
#> [35] "(+) Limonene"
#> [36] "(+)-Dipentene"
#> [37] "Carvene"
#> [38] "Glidesafe"
#> [39] "Glidsafe"
#> [40] "Kautschiin"
#> [41] "Refchole"
#> [42] "(4R)-1-methyl-4-isopropenylcyclohex-1-ene"
#> [43] "(4R)-1-methyl-4-(1-methylethenyl)cyclohexene"
#> [44] "Biogenic SE 374"
#> [45] "(+)-alpha-Limonene"
#> [46] "d-Limonene (natural)"
#> [47] "d-Limoneno [Spanish]"
#> [48] "Limonene, (+)-"
#> [49] "Limonene, dl-"
#> [50] "d-Limoneno"
#> [51] "Hemo-sol"
#> [52] "(4R)-(+)-Limonene"
#> [53] "Cyclohexene, 1-methyl-4-(1-methylethenyl)-, (R)-"
#> [54] "D-limonene [JAN]"
#> [55] "(4R)-4-isopropenyl-1-methyl-cyclohexene"
#> [56] "Citrus stripper oil"
#> [57] "CCRIS 671"
#> [58] "EC 7"
#> [59] "HSDB 4186"
#> [60] "D-1,8-p-Menthadiene"
#> [61] "NCI-C55572"
#> [62] "EINECS 227-813-5"
#> [63] "p-Mentha-1,8-diene, (R)-(+)-"
#> [64] "NSC-844"
#> [65] "Sulfate turpentine, distilled"
#> [66] "(+)-1,8-para-Menthadiene"
#> [67] "Dextro-limonene"
#> [68] "d limonene"
#> [69] "AI3-15191"
#> [70] "NSC-21446"
#> [71] "Orange x"
#> [72] "NSC-757069"
#> [73] "1-Methyl-4-(1-methylethenyl)cyclohexene, (R)-"
#> [74] "EINECS 266-034-5"
#> [75] "(4R)-1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene"
#> [76] "Dipentene no. 122"
#> [77] "D-Limonene Reagent Grade"
#> [78] "DSSTox_CID_778"
#> [79] "EC 227-813-5"
#> [80] "DSSTox_RID_75785"
#> [81] "(+)-Limonene, stabilized with 0.03% tocopherol"
#> [82] "DSSTox_GSID_20778"
#> [83] "CHEMBL449062"
#> [84] "Cyclohexene, 1-methyl-4-(1-methylethenyl)-, (theta)-"
#> [85] "DTXSID1020778"
#> [86] "(R)-(+)-Limonene, 95%"
#> [87] "(R)-(+)-Limonene, 97%"
#> [88] "ZINC967513"
#> [89] "CS-M3273"
#> [90] "(R)-(+)-Limonene, >=93%"
#> [91] "Tox21_200400"
#> [92] "6458AE"
#> [93] "AKOS015899935"
#> [94] "CCG-266134"
#> [95] "DB08921"
#> [96] "LMPR0102090013"
#> [97] "NSC 757069"
#> [98] "(R)-(+)-Limonene, analytical standard"
#> [99] "NCGC00248591-01"
#> [100] "NCGC00248591-02"
#> [101] "NCGC00257954-01"
#> [102] "BS-22387"
#> [103] "CAS-5989-27-5"
#> [104] "(R)-(+)-4-Isopropenyl-1-methylcyclohexene"
#> [105] "L0047"
#> [106] "L0105"
#> [107] "(R)-Limonene 2000 microg/mL in Acetonitrile"
#> [108] "C06099"
#> [109] "D91245"
#> [110] "(4R)-1-Methyl-4-(prop-1-en-2-yl)cyclohexene"
#> [111] "J-502148"
#> [112] "W-105295"
#> [113] "Q27888324"
#> [114] "(R)-(+)-Limonene, primary pharmaceutical reference standard"
#> [115] "UNII-9MC3I34447 component XMGQYMWWDOXHJM-JTQLQIEISA-N"
#> [116] "(R)-(+)-Limonene, purum, >=96.0% (sum of enantiomers, GC)"
#> [117] "(R)-(+)-Limonene, technical, ~90% (sum of enantiomers, GC)"
Or hydrochloric acid (313)
# Same call for hydrochloric acid (CID 313).
get_synonyms(313)
#> [1] "74"
#> [2] "hydrochloric acid"
#> [3] "hydrogen chloride"
#> [4] "7647-01-0"
#> [5] "Muriatic acid"
#> [6] "Chlorohydric acid"
#> [7] "chlorane"
#> [8] "Acide chlorhydrique"
#> [9] "Chlorwasserstoff"
#> [10] "Anhydrous hydrochloric acid"
#> [11] "Spirits of salt"
#> [12] "Hydrogen chloride (HCl)"
#> [13] "Chloorwaterstof"
#> [14] "Chlorowodor"
#> [15] "Acido cloridrico"
#> [16] "Muriaticum acidum"
#> [17] "Aqueous hydrogen chloride"
#> [18] "chlorure d'hydrogene"
#> [19] "Hydrochloric acid gas"
#> [20] "Marine acid"
#> [21] "monohydrochloride"
#> [22] "Spirit of salt"
#> [23] "UNII-QTT17582CB"
#> [24] "NSC 77365"
#> [25] "CHEBI:17883"
#> [26] "Hydrogen chloride (acid)"
#> [27] "[HCl]"
#> [28] "HCl"
#> [29] "QTT17582CB"
#> [30] "MFCD00011324"
#> [31] "NSC-77365"
#> [32] "E507"
#> [33] "Bowl Cleaner"
#> [34] "4-D Bowl Sanitizer"
#> [35] "Chlorowodor [Polish]"
#> [36] "Hydrochloric Acid Solution, 1N"
#> [37] "Emulsion Bowl Cleaner"
#> [38] "Caswell No. 486"
#> [39] "Hydrogenchlorid"
#> [40] "Chloorwaterstof [Dutch]"
#> [41] "o-Tolidine Dihydrochloride Solution"
#> [42] "Hydrochloric acid [JAN]"
#> [43] "Chlorwasserstoff [German]"
#> [44] "Hydrogen Chloride - Methanol Reagent"
#> [45] "Titanium, Reference Standard Solution"
#> [46] "Vanadium, Reference Standard Solution"
#> [47] "Acido clorhidrico"
#> [48] "UN 1789 (solution)"
#> [49] "Hydrochloric acid, ACS reagent, 37%"
#> [50] "UN 1050 (anhydrous)"
#> [51] "mono hydrochloride"
#> [52] "Acido cloridrico [Italian]"
#> [53] "Platinum Cobalt Color Standard Solution"
#> [54] "White Emulsion Bowl Cleaner"
#> [55] "Acido clorhidrico [Spanish]"
#> [56] "Varley Poly-Pak Bowl Creme"
#> [57] "Acide chlorhydrique [French]"
#> [58] "Hydrogen chloride (gas only)"
#> [59] "Hydrochloric Acid Solution, 0.2N (N/5)"
#> [60] "Hydrochloric Acid Solution, 0.5N (N/2)"
#> [61] "Chlorure d'hydrogene [French]"
#> [62] "Chloruro de hidrogeno"
#> [63] "HSDB 545"
#> [64] "Hydrochloric Acid Solution, 0.1N (N/10)"
#> [65] "Chloruro de hidrogeno [Spanish]"
#> [66] "Hygeia Creme Magic Bowl Cleaner"
#> [67] "Percleen Bowl and Urinal Cleaner"
#> [68] "Hydrogen chloride solution 1.0M in ethyl acetate"
#> [69] "EINECS 231-595-7"
#> [70] "UN1050"
#> [71] "UN1789"
#> [72] "UN2186"
#> [73] "Anhydrous hydrogen chloride"
#> [74] "Wuest Bowl Cleaner Super Concentrated"
#> [75] "Chlorure d'hydrogene anhydre [French]"
#> [76] "Cloruro de hidrogeno anhidro [Spanish]"
#> [77] "EPA Pesticide Chemical Code 045901"
#> [78] "Chlorure d'hydrogene anhydre"
#> [79] "Cloruro de hidrogeno anhidro"
#> [80] "UN 2186 (refrigerated liquefied gas)"
#> [81] "chloro"
#> [82] "chlorum"
#> [83] "hydochloride"
#> [84] "hydrochlorie"
#> [85] "hydrochoride"
#> [86] "hydrocloride"
#> [87] "Salzsaeure"
#> [88] "Hydrochloric acid [JAN:NF]"
#> [89] "chloridohydrogen"
#> [90] "hydro chloride"
#> [91] "hydro-chloride"
#> [92] "hydrogenchloride"
#> [93] "Chloro radical"
#> [94] "Soldering acid"
#> [95] "chlorhydric acid"
#> [96] "hydochloric acid"
#> [97] "hydogen chloride"
#> [98] "hydrochoric acid"
#> [99] "hydrocloric acid"
#> [100] "hydrogen chlorid"
Created on 2022-07-21 by the reprex package (v2.0.1)