How to scrape only clickable links in a loop with Scrapy and Selenium

I'm trying to scrape some info on tennis matches from a JavaScript site using Scrapy and Selenium. The starting URLs are for pages that contain all the matches on a given date. The first task on each page is to make all the matches visible from behind some horizontal tabs - I've got this covered. The second task is to scrape the match pages, which sit behind links that aren't present on the starting pages - a specific tag has to be clicked to reach each one.

I've found all these tags no problem and have written a loop that uses Selenium to click each tag and yields a Request on each iteration. The issue is that every time I click through to a match, the page changes and my lovely list of elements detaches from the DOM, so I get a StaleElementReferenceException. I understand why this happens but I'm struggling to come up with a solution.

Here's my code so far:

import datetime as dt
from dateutil.rrule import DAILY, rrule
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.http.response import Response
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

MATCHES_XPATH = "//span[@class='link sc-10gv6xe-4 eEAcym pointer']"
ELEMENT_TEST = "//span[@class='link sc-15d69aw-2 hhbGos']"


class ScraperS24(Spider):

    name = "scores24_scraper"

    custom_settings = {
        "USER_AGENT": "*",
        "LOG_LEVEL": "WARNING",
        "DOWNLOADER_MIDDLEWARES": {
            'scraper.polgara.middlewares.SeleniumMiddleware': 543,
        },
    }
    handle_httpstatus_list = [301]

    def __init__(self):
        dates = list(rrule(DAILY, dtstart=dt.datetime(2015, 1, 6), until=dt.datetime(2015, 1, 21)))
        self.start_urls = [f"https://scores24.live/en/tennis/{d.strftime('%Y-%m-%d')}" for d in dates]
        super().__init__()

    def parse(self, response: Response):
        print(f"Parsing date - {response.url}")
        driver = response.request.meta["driver"]
        # Click the horizontal tabs so every match on the page is visible
        tabs = driver.find_elements_by_xpath("//button[@class='hjlkds-7 khtDIT']")
        for t in tabs:
            driver.execute_script("arguments[0].click();", t)
        matches = driver.find_elements_by_xpath(MATCHES_XPATH)
        wait = WebDriverWait(driver, 20)
        # Click through to each match, record its URL, then navigate back
        for m in matches:
            driver.execute_script("arguments[0].click();", m)
            try:
                wait.until(EC.presence_of_element_located((By.XPATH, ELEMENT_TEST)))
            except TimeoutException:
                driver.back()
            else:
                url = str(driver.current_url)
                driver.back()
                yield Request(url, callback=self._parse_match)

    def _parse_match(self, response):
        print(f"Parsing match - {response.url}")


process = CrawlerProcess()
process.crawl(ScraperS24)
process.start()

And the Selenium middleware:

import logging

from scrapy import Request, Spider, signals
from scrapy.http import HtmlResponse
from selenium import webdriver

logger = logging.getLogger(__name__)


class SeleniumMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request: Request, spider: Spider):
        logger.debug(f"Selenium processing request - {request.url}")
        self.driver.get(request.url)
        request.meta.update({'driver': self.driver})
        return HtmlResponse(
            request.url,
            body=self.driver.page_source,
            encoding='utf-8',
            request=request,
        )

    def spider_opened(self, spider):
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(
            options=options,
            executable_path=cn.GECKODRIVER_PATH,
        )

    def spider_closed(self, spider):
        self.driver.quit()

I've tried adapting the loop using the answer to Scraping results but haven't managed to get it working.
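For illustration, the sort of adaptation I have in mind is to re-locate the match elements by index on every pass of the loop, so the references are fresh after each driver.back(). This is just a rough, untested sketch (and it doesn't deal with the tabs possibly needing to be re-clicked after navigating back):

    def parse(self, response: Response):
        print(f"Parsing date - {response.url}")
        driver = response.request.meta["driver"]
        tabs = driver.find_elements_by_xpath("//button[@class='hjlkds-7 khtDIT']")
        for t in tabs:
            driver.execute_script("arguments[0].click();", t)
        wait = WebDriverWait(driver, 20)
        # Count the matches once, then re-find the list on every iteration so
        # the element reference is never stale after navigating away and back
        n_matches = len(driver.find_elements_by_xpath(MATCHES_XPATH))
        for i in range(n_matches):
            match = driver.find_elements_by_xpath(MATCHES_XPATH)[i]
            driver.execute_script("arguments[0].click();", match)
            try:
                wait.until(EC.presence_of_element_located((By.XPATH, ELEMENT_TEST)))
            except TimeoutException:
                driver.back()
            else:
                url = str(driver.current_url)
                driver.back()
                yield Request(url, callback=self._parse_match)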
