I would like to download the PDFs linked from this page (URL in the code below). Here is my attempt:
# Load the packages
library(rvest)
library(tidyverse)
library(pdftools)
# Scrape the website
url <- "https://reporting.standardbank.com/debt-investors/debt-securities/debt-securities/"
html <- read_html(url)
# Extract the links to the PDFs
pdf_links <- html_nodes(html, "a") %>%
  html_attr("href") %>%
  str_subset("\\.pdf$")
# Download the PDFs
pdf_links %>%
  walk(~ download.file(.x, destfile = basename(.x), mode = "wb"))
CodePudding user response:
library(tidyverse)
library(rvest)
"https://reporting.standardbank.com/debt-investors/debt-securities/debt-securities/#1535044558978-07920fd6-5fa1" %>%
read_html() %>%
html_elements(".table a") %>%
html_attr("href")
CodePudding user response:
I have put together a more general method using RSelenium, which also works when the page renders its links with JavaScript.
# Load the required libraries (install them first if they are missing)
library(rvest)
library(tidyverse)
library(RSelenium)
# Start a Selenium server and a Chrome browser session
# (chromever should match the locally installed Chrome version)
rD <- rsDriver(port = 4445L, browser = "chrome", chromever = "108.0.5359.71")
remDr <- rD[["client"]]
# Go to the website and get the HTML content
url <- "https://www.nedbank.co.za/content/nedbank/desktop/gt/en/investor-relations/debt-investor/debt-investors-programme.html"
remDr$navigate(url)
html <- read_html(remDr$getPageSource()[[1]])
# Find all the links on the page
links <- html_nodes(html, "a")
# Extract the href attributes of the links
hrefs <- html_attr(links, "href")
# Keep only the links that end in ".pdf"
pdf_links <- hrefs[grepl("\\.pdf$", hrefs)]
# Print the PDF links
pdf_links
# Close the browser
remDr$close()
rD$server$stop()
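As above, pdf_links only holds the URLs. A sketch of the download step, assuming the hrefs on the page may be relative and therefore need to be resolved against the page URL first:
# Resolve relative paths against the page URL, then download each PDF
pdf_urls <- xml2::url_absolute(pdf_links, base = url)
walk(pdf_urls, ~ download.file(.x, destfile = basename(.x), mode = "wb"))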