Home > Software engineering >  Webscraping with rvest : https://www.sports-reference.com
Webscraping with rvest : https://www.sports-reference.com

Time:09-21

I am trying to get the tables from this link: https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html but it's not working for me.

I can scrape game detail, scores, and scoring summary but I cannot get the team_stats.

Any help is greatly appreciated.

Thanks

# Question script: scrape one college-football box score page with rvest.
# Each assignment is wrapped in ( ) so the value is also printed at the console.
library(tidyverse)
library(rvest)
url <- "https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html"

# Download and parse the page once; every query below reuses this document.
webpage <- read_html(url)

# Text of the links inside <strong> elements within div.scorebox.
(team_names <- webpage %>% 
    html_nodes('div.scorebox strong a') %>%
    html_text())

# Text of every div.score element.
(scores <- webpage %>% html_nodes('div.score') %>% html_text())
# Label the names 'away' and 'home' by position.
# NOTE(review): assumes exactly two names are returned, away team first - confirm.
(team_names <- setNames(team_names, c('away', 'home')))



# Text of the first <h1> found under #content (inside #wrap).
(game_detail <- webpage %>% 
    html_nodes('#wrap') %>% 
    html_nodes('#content') %>% 
    html_node('h1') %>% 
    html_text()
)


# Scores again, reached through a longer selector path and read with
# html_text2(); this overwrites the `scores` value assigned above.
(scores <- webpage %>% 
    html_nodes('#wrap') %>% 
    html_nodes('#content') %>% 
    html_nodes('.scorebox') %>% 
    html_nodes('div') %>% 
    html_nodes('.score') %>% 
    html_text2())

# Parse the table(s) inside the #div_scoring container into data frames.
(scoring_summary <- webpage %>% 
    html_nodes('#wrap') %>% 
    html_nodes('#content') %>% 
    html_nodes('.table_container#div_scoring') %>% 
    html_table()
)


# Same idea for #div_team_stats, selected by XPath instead of CSS.
# NOTE(review): this is the step the question reports as not working. The
# first answer attributes it to the table markup sitting inside an HTML
# comment in the downloaded page, so there would be no matching element.
(team_stats <- webpage %>% 
    html_nodes('#wrap') %>% 
    html_nodes('#content') %>% 
    html_nodes(xpath = '//*[@id="div_team_stats"]') %>% 
    html_table()
)

CodePudding user response:

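The tables are stored inside HTML comments in the page source and are only turned into real tables by JavaScript in the browser, so rvest never sees them as table elements.
The code below adapts my earlier answer to "Web scraping tables on college basketball stats".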

See comments for details.

library(dplyr)
library(rvest)
library(xml2)

url <- "https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html"

web_page <- read_html(url)

# Only work with the body, serialised back into a single HTML string.
body_html <- as.character(html_node(web_page, "body"))

# Strip just the comment delimiters so the commented-out markup becomes
# ordinary HTML. This is done in memory on the whole string rather than by
# dropping lines from a temp file, because:
#   * x[-grep(pattern, x)] returns an EMPTY vector when nothing matches,
#     which would silently throw the whole page away;
#   * dropping whole lines also discards any real markup that shares a line
#     with "<!--" or "-->";
#   * no temp.xml / temp2.xml files are left behind in the working directory.
# Side effect: the text of genuine comments ends up in the document as plain
# text, which does not affect table extraction.
body_html <- gsub("<!--", "", body_html, fixed = TRUE)
body_html <- gsub("-->", "", body_html, fixed = TRUE)

# Re-parse the uncommented markup and process normally.
# read_html() treats a string containing markup as literal HTML.
body <- read_html(body_html)
html_nodes(body, "table") %>% html_table()

CodePudding user response:

I was able to get the contents of the tables by loading the page in a real browser through RSelenium, using the following code:

library(RSelenium)
library(rvest)

url <- "https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html"

# Start a detached Selenium/Firefox container; host port 4445 is mapped to the
# container's port 4444. system() is used instead of shell() because shell()
# only exists in R for Windows.
system("docker run -d -p 4445:4444 selenium/standalone-firefox")

# `docker run -d` returns before the Selenium server inside the container is
# accepting connections, so pause before connecting.
# NOTE(review): 5 seconds is a guess; lengthen it if remDr$open() cannot connect.
Sys.sleep(5)

remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4445L, browserName = "firefox")
remDr$open()
remDr$navigate(url)

# Take the page source from the browser, then parse every table in it.
tables <- remDr$getPageSource()[[1]] %>% read_html() %>% html_table()

# Release the browser session now that the HTML has been captured.
remDr$close()

tables

[[1]]
# A tibble: 2 x 3
  X1          X2 X3     
  <chr>    <int> <chr>  
1 Duquesne    14 "Final"
2 Hawaii      24 ""     

[[2]]
# A tibble: 2 x 7
  ``                                        ``               `1`   `2`   `3`   `4` Final
  <chr>                                     <chr>          <int> <int> <int> <int> <int>
1 "via Sports Logos.net\n\t\t\tAbout logos" William & Mary    10     7     3    21    41
2 "via Sports Logos.net\n\t\t\tAbout logos" Charlotte          3    14     7     0    24

[[3]]
# A tibble: 11 x 6
   Quarter Time  Team  Description                                                      `W&M`  CHAR
     <int> <chr> <chr> <chr>                                                            <int> <int>
 1       1 12:17 W&M   Ethan Chang 36 yard field goal                                       3     0
 2      NA 5:19  CHAR  Antonio Zita 32 yard field goal                                      3     3
 3      NA 0:00  W&M   Tyler Rose 31 yard pass from Darius Wilson (Ethan Chang kick)       10     3
 4       2 0:00  W&M   Donavyn Lester 22 yard run (Ethan Chang kick)                       17     3
 5      NA 6:34  CHAR  Chavon McEachern 3 yard run (Antonio Zita kick)                     17    10
 6      NA 0:13  CHAR  Xavier Williams 2 yard run (Antonio Zita kick)                      17    17
 7       3 4:30  W&M   Ethan Chang 28 yard field goal                                      20    17
 8      NA 2:18  CHAR  Xavier Williams 67 yard run (Antonio Zita kick)                     20    24
 9       4 12:31 W&M   Bronson Yoder 1 yard run (Ethan Chang kick)                         27    24
10      NA 0:00  W&M   Lachlan Pitts 65 yard pass from Darius Wilson (Ethan Chang kick)    34    24
11      NA 7:38  W&M   Malachi Imoh 17 yard run (Ethan Chang kick)                         41    24

[[4]]
# A tibble: 7 x 3
  Stat              `W&M`         CHAR         
  <chr>             <chr>         <chr>        
1 First Downs       25            22           
2 Rush-Yds-TDs      47-303-3      37-131-3     
3 Cmp-Att-Yd-TD-INT 13-19-256-2-0 17-31-248-0-0
4 Total Yards       559           379          
5 Fumbles-Lost      0-0           0-0          
6 Turnovers         0             0            
7 Penalties-Yards   7-80          9-83         

[[5]]
# A tibble: 3 x 11
  ``              ``        Passing Passing Passing Passing Passing Passing Passing Passing Passing
  <chr>           <chr>     <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
1 Player          School    Cmp     Att     Pct     Yds     Y/A     AY/A    TD      Int     Rate   
2 Xavier Williams Charlotte 12      23      52.2    201     8.7     8.7     0       0       125.6  
3 James Foster    Charlotte 5       8       62.5    47      5.9     5.9     0       0       111.9  

[[6]]
# A tibble: 10 x 14
   ``               ``        Rushing Rushing Rushing Rushing Receiving Receiving Receiving Receiving Scrimmage Scrimmage Scrimmage Scrimmage
   <chr>            <chr>     <chr>   <chr>   <chr>   <chr>   <chr>     <chr>     <chr>     <chr>     <chr>     <chr>     <chr>     <chr>    
 1 Player           School    "Att"   "Yds"   "Avg"   "TD"    "Rec"     "Yds"     "Avg"     "TD"      Plays     Yds       Avg       TD       
 2 Shadrick Byrd    Charlotte "13"    "46"    "3.5"   "0"     "2"       "5"       "2.5"     "0"       15        51        3.4       0        
 3 Chavon McEachern Charlotte "11"    "30"    "2.7"   "1"     ""        ""        ""        ""        11        30        2.7       1        
 4 Xavier Williams  Charlotte "10"    "48"    "4.8"   "2"     ""        ""        ""        ""        10        48        4.8       2        
 5 James Foster     Charlotte "3"     "7"     "2.3"   "0"     ""        ""        ""        ""        3         7         2.3       0        
 6 Elijah Spencer   Charlotte ""      ""      ""      ""      "5"       "107"     "21.4"    "0"       5         107       21.4      0        
 7 Grant Dubose     Charlotte ""      ""      ""      ""      "4"       "67"      "16.8"    "0"       4         67        16.8      0        
 8 Victor Tucker    Charlotte ""      ""      ""      ""      "4"       "58"      "14.5"    "0"       4         58        14.5      0        
 9 Nolan Groulx     Charlotte ""      ""      ""      ""      "1"       "6"       "6.0"     "0"       1         6         6.0       0        
10 Taylor Thompson  Charlotte ""      ""      ""      ""      "1"       "5"       "5.0"     "0"       1         5         5.0       0        

[[7]]
# A tibble: 25 x 16
   ``                ``        Tackles Tackles Tackles Tackles Tackles `Def Int` `Def Int` `Def Int` `Def Int` `Def Int` Fumbles Fumbles Fumbles Fumbles
   <chr>             <chr>     <chr>   <chr>   <chr>   <chr>   <chr>   <chr>     <chr>     <chr>     <chr>     <chr>     <chr>   <chr>   <chr>   <chr>  
 1 Player            School    Solo    Ast     Tot     Loss    Sk      "Int"     "Yds"     "Avg"     "TD"      "PD"      "FR"    "Yds"   "TD"    "FF"   
 2 Wayne Jones       Charlotte 5       6       11      0.0     0.0     ""        ""        ""        ""        ""        ""      ""      ""      ""     
 3 Jalar Holley      Charlotte 2       4       6       0.0     0.0     ""        ""        ""        ""        ""        ""      ""      ""      ""     
 4 Davondre Robinson Charlotte 5       1       6       0.0     0.0     ""        ""        ""        ""        ""        ""      ""      ""      ""     
 5 Kofi Wardlow      Charlotte 2       4       6       1.0     1.0     ""        ""        ""        ""        ""        ""      ""      ""      ""     
 6 Chase Monroe      Charlotte 3       2       5       1.0     0.0     ""        ""        ""        ""        ""        ""      ""      ""      ""     
 7 Cam Burden        Charlotte 3       1       4       1.0     0.0     ""        ""        ""        ""        ""        ""      ""      ""      ""     
 8 Geovonte' Howard  Charlotte 2       2       4       0.0     0.0     ""        ""        ""        ""        ""        ""      ""      ""      ""     
 9 Solomon Rogers    Charlotte 1       3       4       0.0     0.0     ""        ""        ""        ""        ""        ""      ""      ""      ""     
10 Jordan Anderson   Charlotte 0       3       3       0.0     0.0     ""        ""        ""        ""        "1"       ""      ""      ""      ""     
# ... with 15 more rows
# i Use `print(n = ...)` to see more rows

[[8]]
# A tibble: 4 x 10
  ``             ``        `Kick Ret` `Kick Ret` `Kick Ret` `Kick Ret` `Punt Ret` `Punt Ret` `Punt Ret` `Punt Ret`
  <chr>          <chr>     <chr>      <chr>      <chr>      <chr>      <chr>      <chr>      <chr>      <chr>     
1 Player         School    "Ret"      "Yds"      "Avg"      "TD"       "Ret"      "Yds"      "Avg"      "TD"      
2 Henry Rutledge Charlotte "4"        "72"       "18.0"     "0"        ""         ""         ""         ""        
3 Shadrick Byrd  Charlotte "2"        "31"       "15.5"     "0"        ""         ""         ""         ""        
4 Victor Tucker  Charlotte ""         ""         ""         ""         "1"        "5"        "5.0"      "0"       

[[9]]
# A tibble: 3 x 12
  ``           ``        Kicking Kicking Kicking Kicking Kicking Kicking Kicking Punting Punting Punting
  <chr>        <chr>     <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
1 Player       School    "XPM"   "XPA"   "XP%"   "FGM"   "FGA"   "FG%"   "Pts"   "Punts" "Yds"   "Avg"  
2 Antonio Zita Charlotte "3"     "3"     "100.0" "1"     "2"     "50.0"  "6"     ""      ""      ""     
3 Bailey Rice  Charlotte ""      ""      ""      ""      ""      ""      ""      "4"     "189"   "47.3" 

[[10]]
# A tibble: 4 x 15
  ``               ``        Touchdowns Touchdowns Touchdowns Touchdowns Touchdowns Touchdowns Touchdowns Touchdowns Kicking Kicking ``    ``     ``   
  <chr>            <chr>     <chr>      <chr>      <chr>      <chr>      <chr>      <chr>      <chr>      <chr>      <chr>   <chr>   <chr> <chr>  <chr>
1 Player           School    "Rush"     "Rec"      "Int"      "FR"       "PR"       "KR"       "Oth"      "Tot"      "XPM"   "FGM"   "2PM" "Sfty" Pts  
2 Xavier Williams  Charlotte "2"        ""         ""         ""         ""         ""         ""         "2"        ""      ""      ""    ""     12   
3 Chavon McEachern Charlotte "1"        ""         ""         ""         ""         ""         ""         "1"        ""      ""      ""    ""     6    
4 Antonio Zita     Charlotte ""         ""         ""         ""         ""         ""         ""         ""         "3"     "1"     ""    ""     6    

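Here is another approach, which drives Internet Explorer through RDCOMClient (Windows only) and pulls the team stats out of the rendered HTML with regular expressions: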

library(RDCOMClient)
library(stringr)

url <- "https://www.sports-reference.com/cfb/boxscores/2022-09-02-charlotte.html"

# Drive Internet Explorer over COM and read the element from the live document.
IEApp <- COMCreate("InternetExplorer.Application")
IEApp[['Visible']] <- TRUE
IEApp$Navigate(url)
# Fixed pause to let the page load.
# NOTE(review): 5 seconds is arbitrary; a slow connection may need longer.
Sys.sleep(5)
doc <- IEApp$Document()

web_Obj <- doc$GetElementByID("team_stats")
table_Content <- web_Obj$innerHtml()

# The markup is now held in an R string, so the browser can be closed.
IEApp$Quit()

# Row labels: the text of every cell tagged data-stat="stat".
first_Col <- unlist(stringr::str_extract_all(table_Content, "data-stat=\"stat\">.*</th>"))
first_Col <- stringr::str_replace_all(first_Col, "(data-stat=\"stat\">)(.*)(</th>)", "\\2")
# The first match is the column header cell ("Stat"), not a row label. Leaving
# it in gives one more label than there are values, so cbind() recycles the
# value columns and every label is paired with the previous row's numbers.
first_Col <- first_Col[-1]

# Visiting-team values.
second_Col <- unlist(stringr::str_extract_all(table_Content, "data-stat=\"vis_stat\">.*</td><td class=\"center"))
second_Col <- stringr::str_replace_all(second_Col, "(data-stat=\"vis_stat\">)(.*)(</td><td class=\"center)", "\\2")

# Home-team values.
third_Col <- unlist(stringr::str_extract_all(table_Content, "data-stat=\"home_stat\">.*</td>"))
third_Col <- stringr::str_replace_all(third_Col, "(data-stat=\"home_stat\">)(.*)(</td>)", "\\2")

# Fail loudly instead of letting cbind() silently recycle a shorter column.
stopifnot(
  length(first_Col) == length(second_Col),
  length(first_Col) == length(third_Col)
)

table <- cbind(first_Col, second_Col, third_Col)
table

# Result for this game (each label now lines up with its own values):
#>      first_Col           second_Col      third_Col
#> [1,] "First Downs"       "25"            "22"
#> [2,] "Rush-Yds-TDs"      "47-303-3"      "37-131-3"
#> [3,] "Cmp-Att-Yd-TD-INT" "13-19-256-2-0" "17-31-248-0-0"
#> [4,] "Total Yards"       "559"           "379"
#> [5,] "Fumbles-Lost"      "0-0"           "0-0"
#> [6,] "Turnovers"         "0"             "0"
#> [7,] "Penalties-Yards"   "7-80"          "9-83"
  • Related