I would like to download the PDFs linked from this page (URL in the code below). Here is my attempt:
# Load the packages
library(rvest)
library(tidyverse)
library(pdftools)
# Scrape the website
url <- "https://reporting.standardbank.com/debt-investors/debt-securities/debt-securities/"
html <- read_html(url)
# Extract the links to the PDFs
pdf_links <- html_nodes(html, "a") %>%
  html_attr("href") %>%
  str_subset("\\.pdf$")
# Download the PDFs
pdf_links %>%
  walk(~ download.file(.x, destfile = basename(.x), mode = "wb"))
CodePudding user response:
library(tidyverse)
library(rvest)
"https://reporting.standardbank.com/debt-investors/debt-securities/debt-securities/#1535044558978-07920fd6-5fa1" %>%
read_html() %>%
html_elements(".table a") %>%
html_attr("href")
CodePudding user response:
I have put together a more general method using RSelenium, which also works when the page renders its links with JavaScript.
# Load the required libraries (install them first if they are missing)
library(rvest)
library(tidyverse)
library(RSelenium)
# Start a Selenium server and a Chrome browser session
# (chromever should match the locally installed Chrome version)
rD <- rsDriver(port = 4445L, browser = "chrome", chromever = "108.0.5359.71")
remDr <- rD[["client"]]
# Go to the website and get the HTML content
url <- "https://www.nedbank.co.za/content/nedbank/desktop/gt/en/investor-relations/debt-investor/debt-investors-programme.html"
remDr$navigate(url)
html <- read_html(remDr$getPageSource()[[1]])
# Find all the links on the page
links <- html_nodes(html, "a")
# Extract the href attributes of the links
hrefs <- html_attr(links, "href")
# Keep only the links that end in ".pdf"
pdf_links <- hrefs[grepl("\\.pdf$", hrefs)]
# Print the PDF links
pdf_links
# Close the browser
remDr$close()
rD$server$stop()
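As above, pdf_links only holds the URLs. A sketch of the download step, assuming the hrefs on the page may be relative and therefore need to be resolved against the page URL first:
# Resolve relative paths against the page URL, then download each PDF
pdf_urls <- xml2::url_absolute(pdf_links, base = url)
walk(pdf_urls, ~ download.file(.x, destfile = basename(.x), mode = "wb"))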