Problem: I'm trying to scrape multiple tables, but am getting the message "It appears your browser may be outdated..." inside of my scraped tables.
Attempts to Fix: I tried adding a user_agent call inside my read_html() to bypass the issue, but it doesn't seem to change the end result.
Questions: How can I bypass an outdated browser with my user_agent call? Am I placing the user_agent call inside my function in the wrong location?
library(dplyr)
library(tidyverse)
library(janitor)
library(rvest)
library(magrittr)
library(purrr)
library(openxlsx)
# Leaderboard links: one row per tournament/year combination, with the
# past-results URL for that combination stored in `links`.
df6 <- expand.grid(
  tournament_id = c(
    "the-american-express",
    "wm-phoenix-open",
    "farmers-insurance-open"
  ),
  year_id = c("2004", "2005", "2006")
) %>%
  mutate(
    links = paste0(
      "https://www.pgatour.com/tournaments/", tournament_id,
      "/past-results.", year_id, ".html"
    )
  ) %>%
  as_tibble()
# Scrape function
# Reads the page at `link`, keeps the first element of the html_table()
# result, cleans its column names, and adds `tournament` as a column.
#   link       - URL of the page to read
#   tournament - value written into the `tournament` column of the result
get_info <- function(link, tournament) {
link %>%
# NOTE(review): per the problem description above, supplying user_agent here
# does not change the scraped result - confirm whether read_html() honours
# this argument at all.
read_html(, user_agent ="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36") %>%
html_table() %>%
# keep only the first element of what html_table() returned
.[[1]] %>%
clean_names() %>%
# tag every row with the tournament it was scraped for
mutate(tournament = tournament)
}
# Retrieve data: run get_info() on every link (falling back to an empty
# tibble when a call errors), then flatten the nested tables and print.
test501 <- df6 %>%
  mutate(
    tables = map2(
      links,
      tournament_id,
      possibly(get_info, otherwise = tibble())
    )
  ) %>%
  unnest(everything())
test501
CodePudding user response:
Use the Network tab of your browser's dev tools to see where the data actually comes from. You need a different URL construction, followed by some column and row cleaning. I have not cleaned absolutely everything, but I have given a good number of examples:
library(tidyverse)
library(janitor)
library(rvest)
# Leaderboard links: every tournament/year pair, pointing at the
# `pastresults.selectedYear` fragment URL rather than the full page.
df6 <- expand.grid(
  tournament_id = c(
    "the-american-express",
    "wm-phoenix-open",
    "farmers-insurance-open"
  ),
  year_id = c("2004", "2005", "2006")
) %>%
  mutate(
    links = paste0(
      "https://www.pgatour.com/tournaments/", tournament_id,
      "/past-results/jcr:content/mainParsys/pastresults.selectedYear.",
      year_id, ".html"
    )
  ) %>%
  as_tibble()
# Scrape function
# Reads the page at `link`, converts the element selected by
# "[data-display-rounds]" into a table, cleans the column names, and adds
# `tournament` as a column.
#   link       - URL of the page to read
#   tournament - value written into the `tournament` column of the result
# Returns the cleaned table with the extra `tournament` column.
get_info <- function(link, tournament) {
  link %>%
    read_html() %>%
    # select the element that carries a data-display-rounds attribute
    html_element("[data-display-rounds]") %>%
    # TRUE rather than T: T is an ordinary binding that can be reassigned
    html_table(trim = TRUE) %>%
    clean_names() %>%
    mutate(tournament = tournament)
}
# retrieve data
# Run get_info() on every link (falling back to an empty tibble when a call
# errors) and flatten the nested tables into one data frame.
test501 <- df6 %>%
  mutate(
    tables = map2(
      links,
      tournament_id,
      possibly(get_info, otherwise = tibble())
    )
  ) %>%
  unnest(everything())

# Remove rows whose player field contains "PLAYER" (presumably repeated
# header rows - verify against the scraped page), then tidy the columns.
test501 <- test501 %>%
  filter(!grepl("PLAYER", player)) %>%
  mutate(
    # Keep the first space-separated token of each cell. The split result is
    # indexed per element; unlist()-ing the whole column and taking [1] would
    # yield a single value that mutate() recycles into every row.
    across(
      starts_with("rounds"),
      ~ trimws(
        vapply(str_split(.x, " "), function(parts) parts[1], character(1))
      )
    ),
    # Keep the last space-separated token of each position, as a plain
    # character vector rather than a list column.
    pos = vapply(
      str_split(pos, " "),
      function(parts) tail(parts, 1),
      character(1)
    )
  )