I am trying to scrape flight info from https://www.flightstats.com/v2/flight-tracker/departures/LHR/
I am using Selenium to load the page and click the necessary buttons, such as "close cookie" and "hide codeshares". I am able to scrape the first page but not the subsequent ones. I use Selenium and a while loop to click "next page" while pagenumber < lastpage to display the data, but I am unable to scrape it.
I have tried watching the network/XHR tab to see whether any additional requests are made when clicking through the pages, and I didn't see any; the URL does not change either.
Thank you in advance! Any help is greatly appreciated.
To clarify, it is the time_frame_scrape() function that handles clicking through the pages and scraping the information. The table div and class_ name remain the same whilst moving through the pagination, but the scraper doesn't scrape the info past the first page.
import re
import time

import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait as wait
from webdriver_manager.chrome import ChromeDriverManager
info = []


def get_table_info():
    """Scrape flight headings from the FlightStats LHR departures board.

    Dismisses the cookie banner, hides codeshares, selects the
    06:00-12:00 time frame, then pages through the results table and
    appends every <h2> heading text to the module-level ``info`` list.

    Side effects: drives a local Chrome instance; mutates ``info``;
    prints progress to stdout.
    """
    driver = webdriver.Chrome("/home/jamie/Desktop/chromedriver")
    driver.implicitly_wait(5)
    driver.get('https://www.flightstats.com/v2/flight-tracker/departures/LHR/')
    time.sleep(2)

    # Dismiss the cookie consent banner so it cannot intercept later clicks.
    driver.find_element(
        By.XPATH, '//*[@id="onetrust-close-btn-container"]/button').click()
    time.sleep(2)

    # Toggle "hide codeshares".
    driver.find_element(
        By.XPATH,
        '//*[@id="__next"]/div/section/div/div[2]/div[1]/div[1]/div/div').click()
    time.sleep(1)

    # Read the total page count from the pagination widget.  Take the last
    # run of digits in the text instead of a fixed character position
    # (the original `text[-3]` breaks for multi-digit page counts).
    soup = BeautifulSoup(driver.page_source, features="lxml")
    pagination_text = soup.find(
        'div', class_='pagination__PaginationContainer-sc-1515b5x-0 dUhdWi').text
    num_pages = int(re.findall(r'\d+', pagination_text)[-1])
    print(pagination_text)
    print(type(num_pages))
    print(num_pages)

    # Select the next time frame to scrape and apply it.
    next_time_frame = Select(driver.find_element(
        By.XPATH,
        '//*[@id="__next"]/div/section/div/div[1]/div/div[2]/div/form/div[1]/div[2]/div[2]/div[2]/div/div/select'))
    next_time_frame.select_by_visible_text("06:00-12:00")
    time.sleep(1)
    driver.find_element(
        By.XPATH,
        '//*[@id="__next"]/div/section/div/div[1]/div/div[2]/div/form/div[2]/button/span').click()
    time.sleep(5)

    def click_next():
        # Advance the paginated table to the next page.
        driver.find_element(
            By.XPATH,
            '//*[@id="__next"]/div/section/div/div[2]/div[1]/div[3]/div/div/div[12]/span').click()

    def time_frame_scrape():
        # Scrape every page within the selected time frame.
        for page_number in range(num_pages):
            # BUG FIX: re-parse `driver.page_source` on every iteration.
            # The original built `soup` once before the loop, so each
            # pass re-read the stale HTML of the first page — that is why
            # pages 2+ were never scraped.  (The original also set
            # `counter = 1` instead of incrementing, looping forever.)
            page = BeautifulSoup(driver.page_source, features="lxml")
            table = page.find(
                'div', class_='table__TableContainer-sc-1x7nv9w-5 fOHnRO')
            for heading in table.find_all('h2'):
                info.append(heading.get_text(strip=True))
            print(info)
            if page_number < num_pages - 1:  # no "next" after the last page
                click_next()
                time.sleep(2)

    time_frame_scrape()


get_table_info()
```
CodePudding user response:
Here is a less complex solution, and an order of magnitude faster, to retrieve the data you're looking for:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
df_list = []

# The page embeds the complete flight list (every pagination page at once)
# as JSON inside a <script> tag that assigns `__NEXT_DATA__`, so a single
# plain GET is enough — no Selenium, no clicking through pages.
r = requests.get('https://www.flightstats.com/v2/flight-tracker/departures/LHR/')
soup = BeautifulSoup(r.text, 'html.parser')

for script in soup.select('script'):
    if '__NEXT_DATA__ = ' not in script.text:
        continue
    # Strip the JS assignment wrapper to leave the bare JSON payload.
    json_string = script.text.split('__NEXT_DATA__ = ')[1].split('module={}')[0].strip()
    json_flight_obj = json.loads(json_string)
    actual_flights = json_flight_obj['props']['initialState']['flightTracker']['route']['flights']
    for flight in actual_flights:
        df_list.append((
            flight['departureTime']['time24'],
            flight['arrivalTime']['time24'],
            flight['carrier']['name'],
            flight['carrier']['flightNumber'],
            flight['operatedBy'],   # None for non-codeshare flights (see sample output)
            flight['url'],
            flight['airport']['fs'],
            flight['airport']['city'],
        ))
    # Only one script tag carries the payload; stop scanning the rest.
    break

df = pd.DataFrame(
    df_list,
    columns=['departureTime', 'arrivalTime', 'carrier', 'flightNumber',
             'operatedBy', 'url', 'airport_fs', 'airport_city'])
df
This returns a dataframe with 735 rows × 8 columns:
departureTime arrivalTime carrier flightNumber operatedBy url airport_fs airport_city
0 14:25 17:15 American Airlines 91 None /flight-tracker/AA/91?year=2022&month=7&date=25&flightId=1102208979 ORD Chicago
1 14:25 17:15 Finnair 5789 Operated by American Airlines 91 /flight-tracker/AY/5789?year=2022&month=7&date=25&flightId=1102208979 ORD Chicago
2 14:25 17:15 British Airways 1546 Operated by American Airlines 91 /flight-tracker/BA/1546?year=2022&month=7&date=25&flightId=1102208979 ORD Chicago
3 14:25 17:15 Gulf Air 4128 Operated by American Airlines 91 /flight-tracker/GF/4128?year=2022&month=7&date=25&flightId=1102208979 ORD Chicago
4 14:25 17:15 Iberia 4396 Operated by American Airlines 91 /flight-tracker/IB/4396?year=2022&month=7&date=25&flightId=1102208979 ORD Chicago
... ... ... ... ... ... ... ... ...