On one occasion a Stack Overflow user helped me write this script. I am now editing it to add more attributes, but I have problems when I try to add the authors.
The author label is next to `target` and `href`. I have a problem with this part.
library(tidyverse)
library(rvest)
# Record when the run starts so the elapsed time can be reported at the end.
startTime <- Sys.time()
# Scrape one page of the CGSpace search results for "cassava".
#
# pages: page number appended to the search URL.
# Returns a tibble with one row per result item and the columns
# title, fecha, Type, Autor and link.
get_cg <- function(pages) {
  cat("Scraping page", pages, "\n")

  # Build the URL from pieces so the request contains no line break; a
  # string literal that spans two lines would embed a newline in the URL.
  page <-
    str_c(
      "https://cgspace.cgiar.org/discover?",
      "scope=10568/106146&query=cassava&submit=&rpp=10&page=",
      pages
    ) %>%
    read_html()

  # Select the result items once and reuse them for every column, so all
  # columns are guaranteed to have one value per item.
  items <- html_elements(page, ".ds-artifact-item")

  tibble(
    title = items %>%
      html_element(".description-info") %>%
      html_text2(),
    fecha = items %>%
      html_element(".date") %>%
      html_text2(),
    Type = items %>%
      html_element(".artifact-type") %>%
      html_text2(),
    # The author names are the text of the <span> that carries the
    # description-info class; reading `href` from the first
    # .description-info match only repeats the item link.
    Autor = items %>%
      html_element("span.description-info") %>%
      html_text2(),
    link = items %>%
      html_element(".description-info") %>%
      html_attr("href") %>%
      str_c("https://cgspace.cgiar.org", .)
  )
}
# Scrape page 1 only; map_dfr() row-binds what get_cg() returns per page.
df <- map_dfr(1, get_cg)
endTime <- Sys.time()
# Elapsed time since startTime was recorded (the variable name was
# previously misspelled, which made this line fail).
print(endTime - startTime)
I tried other selectors but only got NA.
CodePudding user response:
This should get you a collapsed list of authors for each book, separated by `;`, basically the same as presented on the page:
# Load the packages without conflict warnings. FALSE is spelled out because
# `F` is an ordinary variable that can be reassigned.
library(tidyverse, warn.conflicts = FALSE)
library(rvest, warn.conflicts = FALSE)
# Record when the run starts so the elapsed time can be reported at the end.
startTime <- Sys.time()
# Download one page of the CGSpace "cassava" search results and return a
# data frame with one row per listed item.
#
# pages: page number appended to the search URL.
get_cg <- function(pages) {
  cat("Scraping page", pages, "\n")

  url <- str_c("https://cgspace.cgiar.org/discover?scope=10568/106146&query=cassava&submit=&rpp=10&page=", pages)
  doc <- read_html(url)
  entries <- html_elements(doc, "div.artifact-description > div.artifact-description")

  # Build one record per entry; map_df() row-binds the records.
  map_df(entries, function(entry) {
    href <- html_attr(html_element(entry, "a.description-info"), "href")
    list(
      title = html_text2(html_element(entry, ".description-info")),
      fecha = html_text2(html_element(entry, ".date")),
      Type = html_text2(html_element(entry, ".artifact-type")),
      # Autor_links = html_elements(entry, ".description-info > span > a") %>% html_attr("href") %>% paste(collapse = ";"),
      Autor = html_text2(html_element(entry, "span.description-info")),
      link = str_c("https://cgspace.cgiar.org", href)
    )
  })
}
# Scrape page 1 only; map_dfr() row-binds what get_cg() returns per page.
df <- map_dfr(1, get_cg)
#> Scraping page 1
endTime <- Sys.time()
# Report how long the scrape took.
print(endTime - startTime)
#> Time difference of 0.989037 secs
Result:
# Print the scraped tibble; the `#>` lines below are its output.
df
#> # A tibble: 10 × 5
#> title fecha Type Autor link
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Global Climate Regions for Cassava 2020… Type… Hyma… http…
#> 2 Performance of the CSM–MANIHOT–Cassava model for sim… 2021… Type… Phon… http…
#> 3 Adoption of cassava improved modern varieties in the… 2020 Type… Laba… http…
#> 4 First report of Sri Lankan cassava mosaic virus and … 2021… Type… Chit… http…
#> 5 Surveillance and diagnostics dataset on Sri Lankan c… 2020 Type… Siri… http…
#> 6 Socieconomic and soil conservation practices for cas… 2022… Type… Ibar… http…
#> 7 The transformation and outcome of traditional cassav… 2020 Type… Dou,… http…
#> 8 Cassava Annual Report 2019 2020 Type… Inte… http…
#> 9 Cassava Annual Report 2020 2021… Type… Bece… http…
#> 10 Adoption of cassava improved modern varieties in the… 2020 Type… Flor… http…
# Show each column with its type and leading values.
glimpse(df)
#> Rows: 10
#> Columns: 5
#> $ title <chr> "Global Climate Regions for Cassava", "Performance of the CSM–MA…
#> $ fecha <chr> "2020-08-03", "2021-05-01", "2020", "2021-04-23", "2020", "2022-…
#> $ Type <chr> "Type:Dataset", "Type:Journal Article", "Type:Dataset", "Type:Jo…
#> $ Autor <chr> "Hyman, Glenn G.", "Phoncharoen, Phanupong; Banterng, Poramate; …
#> $ link <chr> "https://cgspace.cgiar.org/handle/10568/109500", "https://cgspac…
Created on 2022-12-03 with reprex v2.0.2
CodePudding user response:
In the code you posted, you're using html_element to extract the Autor and link fields, but html_element only selects the first matching element. You should use html_nodes instead, which will return all matching elements.
Here's how you can use html_nodes to extract the Autor and link fields:
# NOTE(review): html_nodes() is the superseded name for html_elements() in
# rvest >= 1.0.0. It returns every ".description-info" match, so the result
# need not have exactly one entry per ".ds-artifact-item", and both columns
# below read the same `href` values - verify before relying on this.
Autor = page %>%
html_elements(".ds-artifact-item") %>%
html_nodes(".description-info") %>%
html_attr("href"),
link = page %>%
html_elements(".ds-artifact-item") %>%
html_nodes(".description-info") %>%
html_attr("href") %>%
str_c("https://cgspace.cgiar.org", .)
Note that html_nodes returns a list of elements, so you'll need to use map_chr or another function to extract the href attributes from the list.
For example, you can use map_chr like this:
# NOTE(review): given a string, map_chr() extracts a list element named
# "href" from each node rather than reading the HTML attribute - confirm
# this works on rvest nodes; html_attr("href") is the rvest accessor.
Autor = page %>%
html_elements(".ds-artifact-item") %>%
html_nodes(".description-info") %>%
map_chr("href"),
link = page %>%
html_elements(".ds-artifact-item") %>%
html_nodes(".description-info") %>%
map_chr("href") %>%
str_c("https://cgspace.cgiar.org", .)
This should extract the href attributes for all matching elements and return them as a character vector. You can then use this vector to create the Autor and link columns in your data frame.