Home > OS >  Web scraping loop for pages and node in R
Web scraping loop for pages and node in R

Time:10-27

I want to scrape the links from this node. It runs well for one page, but I need to do the same for all 342 pages. Between pages, only the final number in the URL changes: 1, 2, 3, and so on up to 342.

library(rvest)
library(xml2)
library(httr) 

# For page 1
# Scrape the (up to) 10 result links from one search-results page and
# build a data frame with the relative link and the absolute URL.
# The URL is assembled from pieces so the literal contains no stray
# whitespace or line breaks.
website <- paste0(
  'https://cgspace.cgiar.org/discover?',
  'rpp=10&etal=0&query=cassava&scope=10568/35697&group_by=none&page=1'
)
# Download and parse the page once; every XPath below queries this document.
page <- read_html(website)
results_id <- 'aspect_discovery_SimpleSearch_div_search-results'
link <- character(10)
# loop through nodes
for (i in 1:10) {
  # html_node() yields exactly one (possibly missing) node, so html_attr()
  # returns a single value: the href, or NA when result slot i is absent.
  link[i] <- page %>%
    html_node(xpath = paste0(
      '//*[@id="', results_id, '"]/div[', i, ']/div[2]/div/div[1]/a'
    )) %>%
    html_attr('href')
}
pag <- data.frame(link)
pag$link2 <- paste0('https://cgspace.cgiar.org', pag$link)
# Drop slots that had no result (fewer than 10 hits on the page).
pag <- pag[!is.na(pag$link), , drop = FALSE]
rownames(pag) <- NULL
pag

# link                                        link2
# 1  /handle/10568/71370 https://cgspace.cgiar.org/handle/10568/71370
# 2  /handle/10568/43831 https://cgspace.cgiar.org/handle/10568/43831
# 3  /handle/10568/56285 https://cgspace.cgiar.org/handle/10568/56285


# For page 2
# Scrape the (up to) 10 result links from one search-results page and
# build a data frame with the relative link and the absolute URL.
# The URL is assembled from pieces so the literal contains no stray
# whitespace or line breaks.
website <- paste0(
  'https://cgspace.cgiar.org/discover?',
  'rpp=10&etal=0&query=cassava&scope=10568/35697&group_by=none&page=2'
)
# Download and parse the page once; every XPath below queries this document.
page <- read_html(website)
results_id <- 'aspect_discovery_SimpleSearch_div_search-results'
link <- character(10)
# loop through nodes
for (i in 1:10) {
  # html_node() yields exactly one (possibly missing) node, so html_attr()
  # returns a single value: the href, or NA when result slot i is absent.
  link[i] <- page %>%
    html_node(xpath = paste0(
      '//*[@id="', results_id, '"]/div[', i, ']/div[2]/div/div[1]/a'
    )) %>%
    html_attr('href')
}
pag2 <- data.frame(link)
pag2$link2 <- paste0('https://cgspace.cgiar.org', pag2$link)
# Drop slots that had no result (fewer than 10 hits on the page).
pag2 <- pag2[!is.na(pag2$link), , drop = FALSE]
rownames(pag2) <- NULL
pag2

# link                                        link2
# 1  /handle/10568/90626 https://cgspace.cgiar.org/handle/10568/90626
# 2  /handle/10568/71796 https://cgspace.cgiar.org/handle/10568/71796
# 3  /handle/10568/68788 https://cgspace.cgiar.org/handle/10568/68788

The idea is to do this in a single loop and end up with one data frame.

CodePudding user response:

Put it in a loop

library(rvest)
library(xml2)
library(httr) 
# Collect the result links from every search page into one data frame
# with columns `link` (relative href) and `link2` (absolute URL).
base_url <- 'https://cgspace.cgiar.org'
results_id <- 'aspect_discovery_SimpleSearch_div_search-results'
n_pages <- 342
# One slot per page; binding once at the end avoids re-copying the
# accumulated data frame on every iteration.
page_tables <- vector('list', n_pages)
for (page_num in seq_len(n_pages)) {
  website <- paste0(
    base_url,
    '/discover?rpp=10&etal=0&query=cassava&scope=10568/35697',
    '&group_by=none&page=', page_num
  )
  # Download and parse each page once instead of once per result node.
  page <- read_html(website)
  link <- character(10)
  # loop through nodes (index kept distinct from the page counter)
  for (i in 1:10) {
    # html_node() yields exactly one (possibly missing) node, so html_attr()
    # returns a single value: the href, or NA when result slot i is absent.
    link[i] <- page %>%
      html_node(xpath = paste0(
        '//*[@id="', results_id, '"]/div[', i, ']/div[2]/div/div[1]/a'
      )) %>%
      html_attr('href')
  }
  pag <- data.frame(link)
  pag$link2 <- paste0(base_url, pag$link)
  # Drop empty slots, e.g. on a final page with fewer than 10 results.
  page_tables[[page_num]] <- pag[!is.na(pag$link), , drop = FALSE]
}
all_pags <- do.call(rbind, page_tables)
rownames(all_pags) <- NULL
all_pags
  • Related