Select the correct html element with rvest-CodePudding

On a previous occasion a Stack Overflow user helped me write this script. I am now editing it to add more attributes, but I have problems when I try to add the authors.

In the page's HTML, the author label sits next to the `target` and `href` attributes. This is the part I have a problem with.

 library(tidyverse)
 library(rvest)

 startTime <- Sys.time()

 # Scrape one page of CGSpace "discover" results for the query "cassava".
 #
 # pages: page number of the result listing; it is appended to the `page=`
 #        query parameter of the request URL.
 # Returns a tibble with one row per `.ds-artifact-item` node and the columns
 # title, fecha, Type, Autor and link.
 get_cg <- function(pages) {
   cat("Scraping page", pages, "\n")

   # The URL has to be one uninterrupted string: breaking the literal across
   # lines would insert a newline and spaces into the query string.
   page <-
     str_c(
       "https://cgspace.cgiar.org/discover?scope=10568/106146",
       "&query=cassava&submit=&rpp=10&page=",
       pages
     ) %>%
     read_html()

   # Select the result items once and reuse the node set for every column.
   items <- page %>% html_elements(".ds-artifact-item")

   tibble(
     title = items %>%
       html_element(".description-info") %>%
       html_text2(), # works

     fecha = items %>%
       html_element(".date") %>%
       html_text2(), # works

     Type = items %>%
       html_element(".artifact-type") %>%
       html_text2(), # works

     # Open problem (the subject of this question): this selector/attribute
     # pair is the same one used for `link`, so it yields an href and not the
     # author names.
     Autor = items %>%
       html_element(".description-info") %>%
       html_attr("href"),

     link = items %>%
       html_element(".description-info") %>%
       html_attr("href") %>% # works
       str_c("https://cgspace.cgiar.org", .)
   )
 }

  # Scrape result page 1; map_dfr() row-binds the tibble returned per page.
  df <- map_dfr(1, get_cg)

  # Report the elapsed time. The start time is stored in `startTime`
  # (the misspelled name `startTim` does not exist and raised an error).
  endTime <- Sys.time()
  print(endTime - startTime)

I have tried other selectors, but I only get NA.

CodePudding user response：

This should get you a collapsed list of authors for each record, separated by `;` — basically the same as it is presented on the page:

library(tidyverse, warn.conflicts = F)
library(rvest, warn.conflicts = F)

startTime <- Sys.time()

# Scrape one page of CGSpace "discover" results for the query "cassava".
#
# pages: page number appended to the `page=` query parameter.
# Returns one row per matched result node with the columns
# title, fecha, Type, Autor and link.
get_cg <- function(pages) {
  cat("Scraping page", pages, "\n")

  listing_url <- str_c(
    "https://cgspace.cgiar.org/discover?scope=10568/106146&query=cassava&submit=&rpp=10&page=",
    pages
  )
  records <- html_elements(
    read_html(listing_url),
    "div.artifact-description > div.artifact-description"
  )

  # Turn a single result node into a named list of its five fields.
  parse_record <- function(node) {
    # Text of the first descendant of `node` matching `css`.
    text_at <- function(css) html_text2(html_element(node, css))
    href <- html_attr(html_element(node, "a.description-info"), "href")

    list(
      title = text_at(".description-info"),
      fecha = text_at(".date"),
      Type  = text_at(".artifact-type"),
      # Alternative that collects the author links instead of the names:
      # Autor_links = html_elements(node, ".description-info > span  > a") %>%
      #   html_attr("href") %>% paste(collapse = ";"),
      Autor = text_at("span.description-info"),
      link  = str_c("https://cgspace.cgiar.org", href)
    )
  }

  # Row-bind the per-record lists into one data frame.
  map_df(records, parse_record)
}

# Scrape result page 1 only; map_dfr() row-binds the data frame returned by
# get_cg() for each requested page.
df <- map_dfr(1, get_cg)
#> Scraping page 1

# Print the time elapsed since `startTime` was recorded.
endTime <- Sys.time()
print(endTime - startTime)
#> Time difference of 0.989037 secs

Result:

df
#> # A tibble: 10 × 5
#>    title                                                 fecha Type  Autor link 
#>    <chr>                                                 <chr> <chr> <chr> <chr>
#>  1 Global Climate Regions for Cassava                    2020… Type… Hyma… http…
#>  2 Performance of the CSM–MANIHOT–Cassava model for sim… 2021… Type… Phon… http…
#>  3 Adoption of cassava improved modern varieties in the… 2020  Type… Laba… http…
#>  4 First report of Sri Lankan cassava mosaic virus and … 2021… Type… Chit… http…
#>  5 Surveillance and diagnostics dataset on Sri Lankan c… 2020  Type… Siri… http…
#>  6 Socieconomic and soil conservation practices for cas… 2022… Type… Ibar… http…
#>  7 The transformation and outcome of traditional cassav… 2020  Type… Dou,… http…
#>  8 Cassava Annual Report 2019                            2020  Type… Inte… http…
#>  9 Cassava Annual Report 2020                            2021… Type… Bece… http…
#> 10 Adoption of cassava improved modern varieties in the… 2020  Type… Flor… http…

glimpse(df)
#> Rows: 10
#> Columns: 5
#> $ title <chr> "Global Climate Regions for Cassava", "Performance of the CSM–MA…
#> $ fecha <chr> "2020-08-03", "2021-05-01", "2020", "2021-04-23", "2020", "2022-…
#> $ Type  <chr> "Type:Dataset", "Type:Journal Article", "Type:Dataset", "Type:Jo…
#> $ Autor <chr> "Hyman, Glenn G.", "Phoncharoen, Phanupong; Banterng, Poramate; …
#> $ link  <chr> "https://cgspace.cgiar.org/handle/10568/109500", "https://cgspac…

Created on 2022-12-03 with reprex v2.0.2

CodePudding user response：

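In the code you posted, you're using html_element to extract the Autor and link fields, but html_element only selects the first matching element inside each item. To get all matching elements, use html_nodes instead (html_nodes is the older name of html_elements in current rvest versions). Be aware that this can return more than one value per item, so the result may have to be collapsed to one value per item before it is used as a tibble column.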

Here's how you can use html_nodes to extract the Autor and link fields:

# Author column as proposed in this answer: all `.description-info` nodes
# inside the result items, reduced to their href attribute.
# NOTE(review): html_nodes() is the older rvest name for html_elements().
Autor = page %>%
  html_elements(".ds-artifact-item") %>%
  html_nodes(".description-info") %>%
  html_attr("href"),

# Link column: the same href values, prefixed with the site root.
link = page %>%
  html_elements(".ds-artifact-item") %>%
  html_nodes(".description-info") %>%
  html_attr("href") %>%
  str_c("https://cgspace.cgiar.org", .)

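Note that html_nodes returns a node set (a list-like collection of elements) rather than a single element. html_attr is vectorised over a node set, so the code above already returns a character vector; if you prefer to iterate over the nodes explicitly, you can use map_chr or a similar function to extract the href attribute from each node.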

For example, you can use map_chr like this:

# Explicit iteration over the node set: map_chr() calls html_attr(node, "href")
# for every node and returns a character vector. Passing the string "href"
# alone to map_chr() would try to pluck a list element named "href" instead of
# reading the XML attribute.
Autor = page %>%
  html_elements(".ds-artifact-item") %>%
  html_nodes(".description-info") %>%
  map_chr(html_attr, "href"),

# Same extraction for the link column, prefixed with the site root.
link = page %>%
  html_elements(".ds-artifact-item") %>%
  html_nodes(".description-info") %>%
  map_chr(html_attr, "href") %>%
  str_c("https://cgspace.cgiar.org", .)

This should extract the href attributes for all matching elements and return them as a character vector. You can then use this vector to create the Autor and link columns in your data frame.