I have been playing around with RSelenium and web scraping from a list of URLs. Naturally, I want to combine the data from each URL I scrape into a single data frame. When I do that, the data frame that is returned contains the data, along with miscellaneous things such as "checkStatus", "statusClass", etc. It's quite difficult to explain, but I hope the code will help to explain it better.
# Player stats pages to scrape (one URL per player).
URL_list <- c("https://www.premierleague.com/players/4183/Ahmed-El-Mohamady/stats?co=1&se=363")

# Connect to the Selenium server on localhost:4444 and open a Chrome session.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "chrome"
)
remDr$open()
# Webscrape function
#
# Reads every stat from the page currently loaded in the global `remDr`
# driver and compiles them into a one-row data frame.
#
# link_element: not used; the value passed in is ignored.
# Uses the global `Text` (the player's position) for the "Position" column.
# Returns (invisibly) a data frame with "Position" followed by 38 stat
# columns, each holding the text returned by getElementText().
ScrapeDF <- function(link_element) {
  # Output column name -> CSS class selector, one vector per stat group.
  general_css <- c(
    Appearance = ".statappearances",
    Wins = ".statwins",
    Losses = ".statlosses"
  )
  defence_css <- c(
    CleanSheet = ".statclean_sheet",
    Conceded = ".statgoals_conceded",
    Tackles = ".stattotal_tackle",
    SuccessTackle = ".stattackle_success",
    LastManTackle = ".statlast_man_tackle",
    BlockedShots = ".statblocked_scoring_att",
    Interceptions = ".statinterception",
    Clearances = ".stattotal_clearance",
    HeadedClearance = ".stateffective_head_clearance",
    OffLineClearance = ".statclearance_off_line",
    Recoveries = ".statball_recovery",
    DuelsWon = ".statduel_won",
    DuelsLost = ".statduel_lost",
    Successful50_50 = ".statwon_contest",
    AerialWon = ".stataerial_won",
    AerialLost = ".stataerial_lost",
    OwnGoal = ".statown_goals",
    ErrorsToGoal = ".staterror_lead_to_goal"
  )
  team_play_css <- c(
    Assist = ".statgoal_assist",
    Passes = ".stattotal_pass",
    PassperMatch = ".stattotal_pass_per_game",
    BigChanceCreated = ".statbig_chance_created",
    Crosses = ".stattotal_cross",
    CrossAcc = ".statcross_accuracy",
    ThroughBall = ".stattotal_through_ball",
    AccLongBall = ".stataccurate_long_balls"
  )
  discipline_css <- c(
    YellowCard = ".statyellow_card",
    RedCard = ".statred_card",
    Fouls = ".statfouls",
    Offside = ".stattotal_offside"
  )
  attack_css <- c(
    Goals = ".statgoals",
    HeadedGoals = ".statatt_hd_goal",
    RightFootGoal = ".statatt_rf_goal",
    LeftFootGoal = ".statatt_lf_goal",
    Woodwork = ".stathit_woodwork"
  )

  # Look up one element by CSS selector and return its text as character.
  read_stat <- function(css) {
    node <- remDr$findElement(using = "css selector", css)
    as.character(node$getElementText())
  }

  # Elements are read group by group: general, defence, team play,
  # discipline, attack.
  scraped <- lapply(
    c(general_css, defence_css, team_play_css, discipline_css, attack_css),
    read_stat
  )

  # The data frame lists the groups in a different order than they are read.
  column_order <- names(
    c(general_css, attack_css, discipline_css, team_play_css, defence_css)
  )

  DF_Compiled <- do.call(
    data.frame,
    c(list(Position = Text), scraped[column_order])
  )
}
## For loop to webscrape
# Accumulator for the scraped results; starts with no rows and no columns.
CompletePlayerData <- data.frame(matrix(nrow = 0,ncol = 0))

#looping function of scraping the stats for all the players
for (url in URL_list) {
  remDr$navigate(url)
  Sys.sleep(4)

  Position <- remDr$findElement(using = "css selector", ".info")
  Text <- as.character(Position$getElementText())

  # If the first ".info" element does not read "Defender", try the sibling
  # ".info" element instead.
  if (Text != "Defender") {
    Position <- remDr$findElement(using = "css selector", ".info~ .info")
    Text <- as.character(Position$getElementText())
  }

  if (Text == "Defender") {
    # NOTE(review): `Position` is a single web element, so lapply() applies
    # ScrapeDF to each component of that object rather than once per player.
    # Presumably this is where the extra "checkStatus"/"checkError" columns
    # come from - confirm.
    saved_list <- lapply(Position, ScrapeDF)
  }

  # NOTE(review): `saved_list` is only assigned for defenders, so for any
  # other player this binds the value left over from an earlier iteration.
  CompletePlayerData <- bind_rows(CompletePlayerData, saved_list)
}
This will return a data frame with 900 columns, like this:
checkError.Position ... checkStatus.Position ... nativeEvents.Appearance ...
Defender ... Defender ... 14 ...
So my questions are:
- Why does it return so many columns?
- Is there a way to bind the data such that the columns correspond to "Position", "Appearance", "Goals", etc.?
I would like to apologise in advance for the long code.
CodePudding user response:
library(RSelenium)
library(tidyverse)
# Start a Chrome session through the Selenium server at localhost:4444.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "chrome"
)
remDr$open()

# Player stats pages to visit.
URL_list <- c(
  "https://www.premierleague.com/players/4183/Ahmed-El-Mohamady/stats?co=1&se=363"
)
# Webscrape function
#
# Scrape every stat from the player stats page currently loaded in the
# browser and return them as a one-row data frame.
#
# Relies on two objects from the calling (global) environment:
#   remDr - the RSelenium remote driver, already navigated to the page
#   Text  - the player's position string, stored in the "Position" column
#
# link_element: kept so existing callers (e.g. lapply()) keep working;
#   it is not used.
# Returns: a data frame with one row and 39 columns - "Position" followed by
#   the 38 stats - where each stat is the text returned by getElementText().
# Errors: if `Text` does not exist, or if findElement() fails for any stat.
ScrapeDF <- function(link_element) {
  # Fail before any browser round-trip instead of after all 38 of them.
  if (!exists("Text")) {
    stop("`Text` (the player's position) must be set before calling ScrapeDF()",
         call. = FALSE)
  }

  # Output column name -> CSS class selector, grouped by stat category.
  # Groups are read in the order they are listed here.
  stat_groups <- list(
    general = c(
      Appearance = ".statappearances",
      Wins = ".statwins",
      Losses = ".statlosses"
    ),
    defence = c(
      CleanSheet = ".statclean_sheet",
      Conceded = ".statgoals_conceded",
      Tackles = ".stattotal_tackle",
      SuccessTackle = ".stattackle_success",
      LastManTackle = ".statlast_man_tackle",
      BlockedShots = ".statblocked_scoring_att",
      Interceptions = ".statinterception",
      Clearances = ".stattotal_clearance",
      HeadedClearance = ".stateffective_head_clearance",
      OffLineClearance = ".statclearance_off_line",
      Recoveries = ".statball_recovery",
      DuelsWon = ".statduel_won",
      DuelsLost = ".statduel_lost",
      Successful50_50 = ".statwon_contest",
      AerialWon = ".stataerial_won",
      AerialLost = ".stataerial_lost",
      OwnGoal = ".statown_goals",
      ErrorsToGoal = ".staterror_lead_to_goal"
    ),
    team_play = c(
      Assist = ".statgoal_assist",
      Passes = ".stattotal_pass",
      PassperMatch = ".stattotal_pass_per_game",
      BigChanceCreated = ".statbig_chance_created",
      Crosses = ".stattotal_cross",
      CrossAcc = ".statcross_accuracy",
      ThroughBall = ".stattotal_through_ball",
      AccLongBall = ".stataccurate_long_balls"
    ),
    discipline = c(
      YellowCard = ".statyellow_card",
      RedCard = ".statred_card",
      Fouls = ".statfouls",
      Offside = ".stattotal_offside"
    ),
    attack = c(
      Goals = ".statgoals",
      HeadedGoals = ".statatt_hd_goal",
      RightFootGoal = ".statatt_rf_goal",
      LeftFootGoal = ".statatt_lf_goal",
      Woodwork = ".stathit_woodwork"
    )
  )

  # Find one element by CSS selector and return its text as a single string.
  read_stat <- function(css) {
    stat_node <- remDr$findElement(using = "css selector", css)
    as.character(stat_node$getElementText())
  }

  # unname() drops the group names so each stat keeps only its column name.
  selectors <- unlist(unname(stat_groups))
  stat_text <- vapply(selectors, read_stat, character(1))

  # Columns are reported in a different group order than they are read.
  report_groups <- c("general", "attack", "discipline", "team_play", "defence")
  column_order <- names(unlist(unname(stat_groups[report_groups])))

  do.call(
    data.frame,
    c(list(Position = Text), as.list(stat_text[column_order]))
  )
}
## For loop to webscrape
# Visit each URL, scrape the stats of every player whose position reads
# "Defender", and combine the results into one tibble with one row per player.

# ScrapeDF() wrapped so that a failed scrape yields NULL instead of aborting
# the whole run.
safe_scrape <- possibly(ScrapeDF, otherwise = NULL)

# One slot per URL; a slot stays NULL when the player is skipped or the
# scrape fails, and bind_rows() drops NULL entries.
scraped_players <- vector("list", length(URL_list))

#looping function of scraping the stats for all the players
for (i in seq_along(URL_list)) {
  remDr$navigate(URL_list[[i]])
  Sys.sleep(4)  # presumably gives the page time to finish loading

  Position <- remDr$findElement(using = "css selector", ".info")
  # `Text` is deliberately a top-level variable: ScrapeDF() reads it to fill
  # its "Position" column.
  Text <- as.character(Position$getElementText())

  # identical() is FALSE (instead of an error) for NA or empty text.
  if (identical(Text, "Defender")) {
    # `Position` is a single web element, so ScrapeDF() is called once with
    # it. Iterating over it with lapply() would repeat the scrape once per
    # component of the object and produce duplicate rows.
    player_stats <- safe_scrape(Position)
    if (!is.null(player_stats)) {
      scraped_players[[i]] <- player_stats
    }
  }
}

# Bind once at the end instead of growing a tibble inside the loop.
CompletePlayerData <- as_tibble(bind_rows(scraped_players))

CompletePlayerData %>%
  distinct(Position, Goals, Appearance)
Output:
# A tibble: 1 x 3
Position Appearance Goals
<fct> <fct> <fct>
1 Defender 14 0