I am trying to scrape data from this webpage (https://www.fplanalytics.com/history1213.html). I am able to scrape the data from the first page but once I try to go to the next page it keeps giving me back the same data. I have noticed it always retrieves the same URL.
Does anybody have a clue how to get the data from the following pages?
import requests
import os
import shutil
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import time
import pandas as pd
from bs4 import BeautifulSoup
# Seasons encoded as the site encodes them: 1213, 1314, ..., 2122.
years = list(range(1213, 2122, 101))

# Open the first history page in a real browser so the JavaScript that
# builds the paginated datatable actually runs.
driver = webdriver.Chrome("C:/Users/aldi/Downloads/chromedriver.exe")
driver.get('https://www.fplanalytics.com/history1213.html')
driver.maximize_window()

# Empty dataframe to accumulate one row per player per page.
columns_names = ["player", "team", "position", "minutes", "goals",
                 "assists", "cs", "tot pts", "bonus"]
df = pd.DataFrame(columns=columns_names)

while True:
    # Re-parse whatever page the browser is currently showing.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', {'id': 'data-table'})
    for tr in table.find_all('tr')[1:]:           # [1:] skips the header row
        df.loc[len(df)] = [td.text for td in tr.find_all('td')]
    try:
        # BUG FIX: the original line was
        #     next_page = 'https://...history1213.html' next_page
        # which is a SyntaxError (missing '+'). It then fetched the URL with
        # requests, which always returns the same static HTML because the
        # table is paginated client-side by JavaScript — hence "the same
        # data" on every iteration. Navigate with Selenium so the JS runs.
        next_href = soup.find('a', class_='page-link').get('href')
        if not next_href:
            break
        # NOTE(review): find() returns the FIRST 'page-link' anchor, which
        # may be a "previous" link rather than "next" — verify the selector
        # against the live page. If the href is only a '#' fragment, the
        # robust fix is to click the "next" element via Selenium, or to hit
        # the site's JSON data endpoint directly (see Network tab).
        driver.get('https://www.fplanalytics.com/' + next_href.lstrip('/'))
    except AttributeError:
        # No next-page link found: last page reached.
        break
CodePudding user response:
The data is loaded dynamically into the page by JavaScript, which drives the datatable. You can inspect the Network tab in the browser's dev tools and scrape that data endpoint directly. For example:
import requests
import pandas as pd
# Request headers copied from the browser's Network tab so the S3 bucket
# serving the data accepts the cross-site request.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Host': 's3.eu-central-1.amazonaws.com',
    'Origin': 'https://www.fplanalytics.com',
    'Referer': 'https://www.fplanalytics.com/',
    'sec-ch-ua': '"Chromium";v="103", ".Not/A)Brand";v="99"',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'cross-site',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36',
}

# The page's datatable is fed by this JSON endpoint; fetch it directly and
# load the whole season's data in one request (no pagination needed).
endpoint = ('https://s3.eu-central-1.amazonaws.com/fpl.data/db/'
            'history201213.json?_=1659263770681')
response = requests.get(endpoint, headers=headers)
df = pd.DataFrame(response.json())
print(df)
This returns a dataframe with 422 rows × 17 columns:
name team position minutes goals assists cs yc rc saves bonus points gc og ps pm _row
0 [Baird] [FUL] [DEF] 1320 2 0 2 5 0 0 3 45 20 0 0 0 BairdFULDEF
1 [Riise] [FUL] [DEF] 2529 0 5 8 4 0 0 7 95 41 0 0 0 RiiseFULDEF
2 [Senderos] [FUL] [DEF] 1684 0 0 5 6 0 0 1 48 25 0 0 0 SenderosFULDEF
3 [Riether] [FUL] [DEF] 3043 1 6 7 4 0 0 11 109 53 0 0 0 RietherFULDEF
4 [Hughes] [FUL] [DEF] 2115 0 0 4 1 0 0 3 50 43 0 0 0 HughesFULDEF
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...