Scrape data from different pages with same url


I am trying to scrape data from this webpage (https://www.fplanalytics.com/history1213.html). I can scrape the data from the first page, but once I try to go to the next page it keeps giving me back the same data, and I have noticed the URL never changes.

Does anybody have a clue how to get the data from the following pages?


import requests
import os
import shutil
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import time
import pandas as pd
from bs4 import BeautifulSoup

# create list of season codes for the html pages (1213, 1314, ..., 2021)
years = list(range(1213, 2122, 101))

# import html into python
driver = webdriver.Chrome(
    "C:/Users/aldi/Downloads/chromedriver.exe")
driver.get('https://www.fplanalytics.com/history1213.html')
driver.maximize_window()

soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find('table', {'id':'data-table'})

#create empty dataframe and name columns
columns_names = ["player","team","position", "minutes", "goals", "assists", "cs", "tot pts", "bonus"]
df = pd.DataFrame(columns = columns_names)

#grab table in one page
#trs = table.find_all('tr')[1:]
#for row in trs:
#    row_data = row.find_all('td')
#    row = [td.text for td in row_data]
#    length = len(df)
#    df.loc[length] = row
    
    
while True: 
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', {'id':'data-table'})
    trs = table.find_all('tr')[1:]
    for row in trs:
        row_data = row.find_all('td')
        row = [td.text for td in row_data]
        length = len(df)
        df.loc[length] = row
    try:
        #grabs the url of the next page
        next_page = soup.find('a', class_ = 'page-link').get('href')
        next_page = 'https://www.fplanalytics.com/history1213.html' + next_page
#        driver.get(next_page)
    except:
        break
    
    #Imports the next pages HTML into python
    page = requests.get(next_page)
    soup = BeautifulSoup(page.text, 'lxml')

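The href-following attempt above cannot work here: the pager is driven by JavaScript, so the link's href never points at a new document and requests keeps fetching the same page. One workaround is to stay inside Selenium and click the pager control instead of following a URL. A minimal sketch, assuming the pager renders a clickable 'Next' link (the locator is a guess and should be confirmed by inspecting the page):

from selenium.webdriver.common.by import By

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', {'id': 'data-table'})
    for row in table.find_all('tr')[1:]:
        df.loc[len(df)] = [td.text for td in row.find_all('td')]
    try:
        # 'Next' link text is an assumption -- inspect the pager to confirm it
        driver.find_element(By.LINK_TEXT, 'Next').click()
        time.sleep(1)  # give the JavaScript time to redraw the table
    except Exception:
        break  # no 'Next' control left, last page reached
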
CodePudding user response:

The data is loaded into the page dynamically by JavaScript, which populates the datatable, so fetching the URL again returns the same static HTML. You can inspect the Network tab in the browser's dev tools, find the request that delivers the data, and scrape that endpoint directly. For example:

import requests
import pandas as pd

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Host': 's3.eu-central-1.amazonaws.com',
    'Origin': 'https://www.fplanalytics.com',
    'Referer': 'https://www.fplanalytics.com/',
    'sec-ch-ua': '"Chromium";v="103", ".Not/A)Brand";v="99"',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'cross-site',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36'
}
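# the '_=...' query parameter appears to be a cache-busting timestamp added by
# the page's JavaScript; it can safely be left off the request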
r = requests.get('https://s3.eu-central-1.amazonaws.com/fpl.data/db/history201213.json?_=1659263770681', headers=headers)
df = pd.DataFrame(r.json())
print(df)

This returns a dataframe with 422 rows × 17 columns:

          name   team position  minutes  goals  assists  cs  yc  rc  saves  bonus  points  gc  og  ps  pm            _row
0      [Baird]  [FUL]    [DEF]     1320      2        0   2   5   0      0      3      45  20   0   0   0     BairdFULDEF
1      [Riise]  [FUL]    [DEF]     2529      0        5   8   4   0      0      7      95  41   0   0   0     RiiseFULDEF
2   [Senderos]  [FUL]    [DEF]     1684      0        0   5   6   0      0      1      48  25   0   0   0  SenderosFULDEF
3    [Riether]  [FUL]    [DEF]     3043      1        6   7   4   0      0     11     109  53   0   0   0   RietherFULDEF
4     [Hughes]  [FUL]    [DEF]     2115      0        0   4   1   0      0      3      50  43   0   0   0    HughesFULDEF
..         ...    ...      ...      ...    ...      ...  ..  ..  ..    ...    ...     ...  ..  ..  ..  ..             ...
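
The question's years list suggests the other seasons are wanted too. Assuming the JSON files for those seasons follow the same history20<season>.json naming pattern (only history201213.json is confirmed above), a sketch that collects them all could look like this:

import requests
import pandas as pd

frames = []
for season in range(1213, 2122, 101):  # 1213, 1314, ..., 2021, as in the question
    # the file name pattern beyond history201213.json is an assumption --
    # verify the real URLs in the Network tab
    url = f'https://s3.eu-central-1.amazonaws.com/fpl.data/db/history20{season}.json'
    r = requests.get(url, headers=headers)  # reuse the headers dict defined above
    r.raise_for_status()  # fail loudly if a season file does not exist
    season_df = pd.DataFrame(r.json())
    season_df['season'] = season  # tag each row with its season code
    frames.append(season_df)

all_seasons = pd.concat(frames, ignore_index=True)
print(all_seasons.shape)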