I am trying to get the tables from this link: https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html but it's not working for me.
I can scrape game detail, scores, and scoring summary but I cannot get the team_stats.
Any help is greatly appreciated.
Thanks
library(tidyverse)
library(rvest)
# Box score page to scrape.
url <- "https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html"
webpage <- read_html(url)

# Team names: the links inside the bold headings of the scorebox.
(team_names <- html_text(html_elements(webpage, "div.scorebox strong a")))

# Scores, taken directly from the score divs.
(scores <- html_text(html_elements(webpage, "div.score")))

# Label the two names; the first entry is treated as the away team.
(team_names <- setNames(team_names, c("away", "home")))

# Page headline (the <h1> inside #content).
(game_detail <- webpage %>%
  html_elements("#wrap") %>%
  html_elements("#content") %>%
  html_element("h1") %>%
  html_text())

# Scores again, this time walking down from #wrap and using html_text2().
(scores <- webpage %>%
  html_elements("#wrap") %>%
  html_elements("#content") %>%
  html_elements(".scorebox") %>%
  html_elements("div") %>%
  html_elements(".score") %>%
  html_text2())

# Scoring summary table.
(scoring_summary <- webpage %>%
  html_elements("#wrap") %>%
  html_elements("#content") %>%
  html_elements(".table_container#div_scoring") %>%
  html_table())

# Team stats table. This is the step that does not return the table
# (the subject of the question).
(team_stats <- webpage %>%
  html_elements("#wrap") %>%
  html_elements("#content") %>%
  html_elements(xpath = '//*[@id="div_team_stats"]') %>%
  html_table())
CodePudding user response:
The tables are stored in the comments in the HTML code and are rendered by javascript.
The approach below is adapted from my answer to the question "Web scraping tables on college basketball stats".
See the comments in the code for details.
library(dplyr)
library(rvest)
library(xml2)
url <- "https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html"
web_page <- read_html(url)

# Only save and work with the body
body <- html_node(web_page, "body")

# Use throwaway files so nothing is left behind in the working directory
raw_file <- tempfile(fileext = ".xml")
clean_file <- tempfile(fileext = ".xml")
write_xml(body, raw_file)

# Find and remove comments: drop every line that holds a comment delimiter so
# the markup between the delimiters is parsed as regular HTML.
# grepl() with negation is used instead of lines[-grep(...)]: when nothing
# matches, grep() returns integer(0) and lines[-integer(0)] drops every line.
lines <- readLines(raw_file)
lines <- lines[!grepl("<!--", lines, fixed = TRUE)]
lines <- lines[!grepl("-->", lines, fixed = TRUE)]
writeLines(lines, clean_file)

# Read the file back in and process normally
body <- read_html(clean_file)
unlink(c(raw_file, clean_file))
html_nodes(body, "table") %>% html_table()
CodePudding user response:
I was able to get the content of the tables with the following code:
library(RSelenium)
library(rvest)
url <- "https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html"

# Start a Selenium server with Firefox in Docker, published on local port 4445.
# system() is used because shell() is only available in R for Windows.
system("docker run -d -p 4445:4444 selenium/standalone-firefox")
# Fixed pause so the container can start accepting connections before we
# connect; increase it if remDr$open() fails to reach the server.
Sys.sleep(5)

remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4445L,
  browserName = "firefox"
)
remDr$open()
remDr$navigate(url)

# Parse every table from the page source returned by the browser, then close
# the browser session before printing the result.
tables <- remDr$getPageSource()[[1]] %>% read_html() %>% html_table()
remDr$close()
tables
[[1]]
# A tibble: 2 x 3
X1 X2 X3
<chr> <int> <chr>
1 Duquesne 14 "Final"
2 Hawaii 24 ""
[[2]]
# A tibble: 2 x 7
`` `` `1` `2` `3` `4` Final
<chr> <chr> <int> <int> <int> <int> <int>
1 "via Sports Logos.net\n\t\t\tAbout logos" William & Mary 10 7 3 21 41
2 "via Sports Logos.net\n\t\t\tAbout logos" Charlotte 3 14 7 0 24
[[3]]
# A tibble: 11 x 6
Quarter Time Team Description `W&M` CHAR
<int> <chr> <chr> <chr> <int> <int>
1 1 12:17 W&M Ethan Chang 36 yard field goal 3 0
2 NA 5:19 CHAR Antonio Zita 32 yard field goal 3 3
3 NA 0:00 W&M Tyler Rose 31 yard pass from Darius Wilson (Ethan Chang kick) 10 3
4 2 0:00 W&M Donavyn Lester 22 yard run (Ethan Chang kick) 17 3
5 NA 6:34 CHAR Chavon McEachern 3 yard run (Antonio Zita kick) 17 10
6 NA 0:13 CHAR Xavier Williams 2 yard run (Antonio Zita kick) 17 17
7 3 4:30 W&M Ethan Chang 28 yard field goal 20 17
8 NA 2:18 CHAR Xavier Williams 67 yard run (Antonio Zita kick) 20 24
9 4 12:31 W&M Bronson Yoder 1 yard run (Ethan Chang kick) 27 24
10 NA 0:00 W&M Lachlan Pitts 65 yard pass from Darius Wilson (Ethan Chang kick) 34 24
11 NA 7:38 W&M Malachi Imoh 17 yard run (Ethan Chang kick) 41 24
[[4]]
# A tibble: 7 x 3
Stat `W&M` CHAR
<chr> <chr> <chr>
1 First Downs 25 22
2 Rush-Yds-TDs 47-303-3 37-131-3
3 Cmp-Att-Yd-TD-INT 13-19-256-2-0 17-31-248-0-0
4 Total Yards 559 379
5 Fumbles-Lost 0-0 0-0
6 Turnovers 0 0
7 Penalties-Yards 7-80 9-83
[[5]]
# A tibble: 3 x 11
`` `` Passing Passing Passing Passing Passing Passing Passing Passing Passing
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Player School Cmp Att Pct Yds Y/A AY/A TD Int Rate
2 Xavier Williams Charlotte 12 23 52.2 201 8.7 8.7 0 0 125.6
3 James Foster Charlotte 5 8 62.5 47 5.9 5.9 0 0 111.9
[[6]]
# A tibble: 10 x 14
`` `` Rushing Rushing Rushing Rushing Receiving Receiving Receiving Receiving Scrimmage Scrimmage Scrimmage Scrimmage
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Player School "Att" "Yds" "Avg" "TD" "Rec" "Yds" "Avg" "TD" Plays Yds Avg TD
2 Shadrick Byrd Charlotte "13" "46" "3.5" "0" "2" "5" "2.5" "0" 15 51 3.4 0
3 Chavon McEachern Charlotte "11" "30" "2.7" "1" "" "" "" "" 11 30 2.7 1
4 Xavier Williams Charlotte "10" "48" "4.8" "2" "" "" "" "" 10 48 4.8 2
5 James Foster Charlotte "3" "7" "2.3" "0" "" "" "" "" 3 7 2.3 0
6 Elijah Spencer Charlotte "" "" "" "" "5" "107" "21.4" "0" 5 107 21.4 0
7 Grant Dubose Charlotte "" "" "" "" "4" "67" "16.8" "0" 4 67 16.8 0
8 Victor Tucker Charlotte "" "" "" "" "4" "58" "14.5" "0" 4 58 14.5 0
9 Nolan Groulx Charlotte "" "" "" "" "1" "6" "6.0" "0" 1 6 6.0 0
10 Taylor Thompson Charlotte "" "" "" "" "1" "5" "5.0" "0" 1 5 5.0 0
[[7]]
# A tibble: 25 x 16
`` `` Tackles Tackles Tackles Tackles Tackles `Def Int` `Def Int` `Def Int` `Def Int` `Def Int` Fumbles Fumbles Fumbles Fumbles
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Player School Solo Ast Tot Loss Sk "Int" "Yds" "Avg" "TD" "PD" "FR" "Yds" "TD" "FF"
2 Wayne Jones Charlotte 5 6 11 0.0 0.0 "" "" "" "" "" "" "" "" ""
3 Jalar Holley Charlotte 2 4 6 0.0 0.0 "" "" "" "" "" "" "" "" ""
4 Davondre Robinson Charlotte 5 1 6 0.0 0.0 "" "" "" "" "" "" "" "" ""
5 Kofi Wardlow Charlotte 2 4 6 1.0 1.0 "" "" "" "" "" "" "" "" ""
6 Chase Monroe Charlotte 3 2 5 1.0 0.0 "" "" "" "" "" "" "" "" ""
7 Cam Burden Charlotte 3 1 4 1.0 0.0 "" "" "" "" "" "" "" "" ""
8 Geovonte' Howard Charlotte 2 2 4 0.0 0.0 "" "" "" "" "" "" "" "" ""
9 Solomon Rogers Charlotte 1 3 4 0.0 0.0 "" "" "" "" "" "" "" "" ""
10 Jordan Anderson Charlotte 0 3 3 0.0 0.0 "" "" "" "" "1" "" "" "" ""
# ... with 15 more rows
# i Use `print(n = ...)` to see more rows
[[8]]
# A tibble: 4 x 10
`` `` `Kick Ret` `Kick Ret` `Kick Ret` `Kick Ret` `Punt Ret` `Punt Ret` `Punt Ret` `Punt Ret`
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Player School "Ret" "Yds" "Avg" "TD" "Ret" "Yds" "Avg" "TD"
2 Henry Rutledge Charlotte "4" "72" "18.0" "0" "" "" "" ""
3 Shadrick Byrd Charlotte "2" "31" "15.5" "0" "" "" "" ""
4 Victor Tucker Charlotte "" "" "" "" "1" "5" "5.0" "0"
[[9]]
# A tibble: 3 x 12
`` `` Kicking Kicking Kicking Kicking Kicking Kicking Kicking Punting Punting Punting
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Player School "XPM" "XPA" "XP%" "FGM" "FGA" "FG%" "Pts" "Punts" "Yds" "Avg"
2 Antonio Zita Charlotte "3" "3" "100.0" "1" "2" "50.0" "6" "" "" ""
3 Bailey Rice Charlotte "" "" "" "" "" "" "" "4" "189" "47.3"
[[10]]
# A tibble: 4 x 15
`` `` Touchdowns Touchdowns Touchdowns Touchdowns Touchdowns Touchdowns Touchdowns Touchdowns Kicking Kicking `` `` ``
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Player School "Rush" "Rec" "Int" "FR" "PR" "KR" "Oth" "Tot" "XPM" "FGM" "2PM" "Sfty" Pts
2 Xavier Williams Charlotte "2" "" "" "" "" "" "" "2" "" "" "" "" 12
3 Chavon McEachern Charlotte "1" "" "" "" "" "" "" "1" "" "" "" "" 6
4 Antonio Zita Charlotte "" "" "" "" "" "" "" "" "3" "1" "" "" 6
Here is another approach:
library(RDCOMClient)
library(stringr)
url <- "https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html"

# Load the page in Internet Explorer through COM and read the HTML of the
# element with id "team_stats".
IEApp <- COMCreate("InternetExplorer.Application")
IEApp[['Visible']] <- TRUE
IEApp$Navigate(url)
Sys.sleep(5) # fixed wait for the page to finish loading
doc <- IEApp$Document()
web_Obj <- doc$GetElementByID("team_stats")
table_Content <- web_Obj$innerHtml()
# The HTML is now held in an R string, so the browser can be closed.
IEApp$Quit()

# Pull the three columns out of the raw HTML by their data-stat attributes.
first_Col <- unlist(stringr::str_extract_all(table_Content, "data-stat=\"stat\">.*</th>"))
first_Col <- stringr::str_replace_all(first_Col, "(data-stat=\"stat\">)(.*)(</th>)", "\\2")
second_Col <- unlist(stringr::str_extract_all(table_Content, "data-stat=\"vis_stat\">.*</td><td class=\"center"))
second_Col <- stringr::str_replace_all(second_Col, "(data-stat=\"vis_stat\">)(.*)(</td><td class=\"center)", "\\2")
third_Col <- unlist(stringr::str_extract_all(table_Content, "data-stat=\"home_stat\">.*</td>"))
third_Col <- stringr::str_replace_all(third_Col, "(data-stat=\"home_stat\">)(.*)(</td>)", "\\2")

# The header cell ("Stat") is matched by the first pattern as well, so
# first_Col has one more entry than the value columns. Drop it; otherwise
# cbind() recycles the shorter columns and every label ends up one row off.
first_Col <- first_Col[-1]
stopifnot(
  length(first_Col) == length(second_Col),
  length(first_Col) == length(third_Col)
)
table <- cbind(first_Col, second_Col, third_Col)
table
#>      first_Col           second_Col      third_Col
#> [1,] "First Downs"       "25"            "22"
#> [2,] "Rush-Yds-TDs"      "47-303-3"      "37-131-3"
#> [3,] "Cmp-Att-Yd-TD-INT" "13-19-256-2-0" "17-31-248-0-0"
#> [4,] "Total Yards"       "559"           "379"
#> [5,] "Fumbles-Lost"      "0-0"           "0-0"
#> [6,] "Turnovers"         "0"             "0"
#> [7,] "Penalties-Yards"   "7-80"          "9-83"