how to browse pages in a website using selenium


I have a website to scrape. It contains a list of companies spread across many pages (11). My script clicks on each company in the list and extracts its information. The problem is that I only scrape the first page of the list; I want to browse all the pages to get all the information. This is my code:

import logging
import time
from datetime import datetime

from time import strftime, localtime

import scrapy

from scrapy import Selector
from selenium.common import NoSuchElementException

from selenium.webdriver.common.by import By
from selenium import webdriver

from webdriver_manager.chrome import ChromeDriverManager

from SaudiExchange.Config.AppSettings import AppSettings
from SaudiExchange.Repositories.AwsS3Repository import AwsS3Repository


class SaudiExchangeSpider(scrapy.Spider):
    name = 'saudi_exchange'
    logger = logging.getLogger("test_Launcher")
    settings = AppSettings(logger)
    awss3repository = AwsS3Repository(logger, settings)
    base_url = 'https://www.saudiexchange.sa'
    start_urls = ['https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory?'
                  'locale=en']

    today = f'output/Saudi_Exchange {strftime("%Y-%m-%d %H-%M-%S", localtime())}.json'

    if not settings.RUNNING_FROM_AWS:
        custom_settings = {

            'FEED_URI': 'Output/' + name + datetime.today().strftime('%y%m%d') + '.json',
            'FEED_FORMAT': 'json'
        }
    browser = webdriver.Chrome(executable_path=ChromeDriverManager().install())

    def selectCheckBox(self, checkboxValue):

        try:
            checkBox = self.browser.find_element(By.XPATH, f'//*[@value="{checkboxValue}"]')
            checkBox.click()
        except NoSuchElementException as e:
            time.sleep(1)

    def parse(self, no_response):

        self.browser.get(
            'https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory?')
        time.sleep(2)
        response = Selector(text=self.browser.page_source)

        all_listings = response.xpath('//*[@id="companiesListTable"]/tbody/tr/td[2]/a/@href').getall()

        for detail_page in all_listings:
            self.browser.get(no_response.urljoin(detail_page))
            time.sleep(2)
            self.browser.refresh()
            self.browser.find_element(By.XPATH, '//*[@id="statementsTab"]').click()
            time.sleep(2)
            self.browser.refresh()
            time.sleep(2)
            response = Selector(text=self.browser.page_source)
            Test_pdfs = response.css('[id="factSheetTable"] tbody tr')
            if Test_pdfs:
                time.sleep(1)
            else:
                self.browser.find_element(By.XPATH, '//*[@id="financialStatementsTab"]').click()
                time.sleep(5)
                self.browser.find_element(By.XPATH,
                                          '//*[@id="chart_sub_tab9"]/div/table/tbody/tr[1]/td[3]/a[1]').click()
                time.sleep(3)

                """filingInformationCheckBox = self.browser.find_element(By.XPATH, '//*[@value="FilingInformation"]')
                if filingInformationCheckBox is not None:
                    filingInformationCheckBox.click()"""

                self.selectCheckBox("FilingInformation")
                self.selectCheckBox("StatementOfFinancialPositionCurrentNonCurrent")
                self.selectCheckBox("StatementOfIncomeFunctionOfExpense")
                self.selectCheckBox("StatementOfIncomeNatureOfExpense")
                self.selectCheckBox("StatementOfCashFlowsIndirectMethod")
                self.selectCheckBox("StatementOfCashFlowsIndirectMethodShareholdersOperations")
                self.selectCheckBox("StatementOfOtherComprehensiveIncomeBeforeTaxShareholdersOperations")
                self.selectCheckBox("StatementOfFinancialPositionOrderOfLiquidity")

                """
                self.browser.find_element(By.XPATH, '//*[@value="FilingInformation"]').click()
    
                self.browser.find_element(By.XPATH, '//*[contains(@value, "StatementOfCash")]').click()
    
                self.browser.find_element(By.XPATH, '//*[contains(@value, "StatementOfFinancial")]').click()
    
                self.browser.find_element(By.XPATH, '//*[contains(@value, "StatementOfIncome")]').click()"""

                time.sleep(2)

                self.browser.find_element(By.XPATH, '//*[@value="submit"]').click()

                response = Selector(text=self.browser.page_source)

                Results = list()
                for data in response.xpath('/html/body/div[4]/div/table/tbody[2]/tr'):
                    key = data.css('td:nth-child(1) span::text').get()
                    value = data.css('td:nth-child(2)::text').get()

                    if key == "      Name of reporting entity":
                        name_company = value

                    if key == "      Company symbol code| ISIN code":
                        symbol = value
                    if key == "      Description of presentation currency":
                        currency = value
                    if key == "      Level of rounding used in financial statements":
                        val = value

                        single = {
                            "name_company": name_company,
                            "symbol": symbol,
                            "Value_in": val,
                            "Currency": currency

                        }
                        Results.append(single)

                Financial_Data = list()

                first_year1 = response.xpath('/html/body/div[6]/div/table/tbody[1]/tr[1]/th[2]/text()').get()
                second_year1 = response.xpath('/html/body/div[6]/div/table/tbody[1]/tr[1]/th[3]/text()').get()
                for data in response.xpath('/html/body/div[6]/div/table/tbody[2]/tr'):
                    key = data.css('td:nth-child(1) span::text').get()
                    val_1 = data.css('td:nth-child(2)::text').get()
                    val_2 = data.css('td:nth-child(3)::text').get()

                    if key == "      Total assets" or key == "         Total liabilities" or key == "         Total equity" or key == "      Total liabilities and equity":
                        single = {
                            "key": key.replace(' ', ''),
                            "Values": [
                                {"Key": first_year1[0:4], "Value": val_1},
                                {"Key": second_year1[0:4], "Value": val_2}
                            ]
                        }
                        Financial_Data.append(single)
                first_year2 = response.xpath('/html/body/div[7]/div/table/tbody[1]/tr[1]/th[2]/text()').get()
                second_year2 = response.xpath('/html/body/div[7]/div/table/tbody[1]/tr[1]/th[3]/text()').get()

                for data in response.xpath('/html/body/div[7]/div/table/tbody[2]/tr'):
                    key = data.css('td:nth-child(1) span::text').get()
                    val_1 = data.css('td:nth-child(2)::text').get()
                    val_2 = data.css('td:nth-child(3)::text').get()

                    if key == "      Profit (loss), attributable to equity holders of parent company":
                        key = 'Net Income'
                        single = {
                            "key": key.replace(' ', ''),
                            "Values": [
                                {"Key": first_year2[0:4], "Value": val_1},
                                {"Key": second_year2[0:4], "Value": val_2}
                            ]
                        }
                        Financial_Data.append(single)
                first_year3 = response.xpath('/html/body/div[10]/div/table/tbody[1]/tr[1]/th[2]/text()').get()
                second_year3 = response.xpath('/html/body/div[10]/div/table/tbody[1]/tr[1]/th[3]/text()').get()
                for data in response.xpath('/html/body/div[10]/div/table/tbody[2]/tr'):
                    key = data.css('td:nth-child(1) span::text').get()
                    val_1 = data.css('td:nth-child(2)::text').get()
                    val_2 = data.css('td:nth-child(3)::text').get()

                    if key == "         Net cash flows from (used in) operating activities" or key == "         Net cash flows from (used in) investing activities" or key == "         Net cash flows from (used in) financing activities" or key == "      Net increase (decrease) in cash and cash equivalents":
                        single = {
                            "key": key.replace(' ', ''),
                            "Values": [
                                {"Key": first_year3[0:4], "Value": val_1},
                                {"Key": second_year3[0:4], "Value": val_2}
                            ]
                        }
                        Financial_Data.append(single)

                yield {

                    "Results": Results,
                    "Financial Data": Financial_Data
                }
        element = self.browser.find_element(By.CSS_SELECTOR, "a#pageing_next")
        element.click()

When I run the code I get this error:

selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"a#pageing_next"}

CodePudding user response:

If you inspect the a element with id pageing_next in the browser dev tools, you will see there is no real pagination and no href attribute. It is just a dummy element that is populated by a script. This means you need to iterate over the rows and columns of the table itself to extract the entire list.

For background, look up guides on handling web tables in Selenium and on handling dynamic web tables in Selenium. A minimal sketch of the approach is below.
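
For example, this is one way to loop over the table rows with Selenium. It assumes the table keeps the id companiesListTable used in your spider; the selectors are not verified against the live page:

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory?locale=en')
time.sleep(2)  # crude wait for the table to render, matching the question's style

# Grab every row currently rendered in the companies table
rows = driver.find_elements(By.CSS_SELECTOR, '#companiesListTable tbody tr')
for row in rows:
    # Collect the text of every cell in the row
    cells = row.find_elements(By.TAG_NAME, 'td')
    print([cell.text for cell in cells])

driver.quit()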

Hope this helps...

CodePudding user response:

Actually, the data is also available from an API call that returns a JSON response. An example is given below:

from scrapy.crawler import CrawlerProcess
import scrapy


class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        # The issuers directory page populates its table from this servlet,
        # which returns the full company list as JSON in a single response
        url = 'https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet'
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        data = response.json()

        for p in data:
            item = dict()
            item['name_company'] = p["companyName"]
            item['symbol'] = p["symbol"]
            yield item


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(TestSpider)
    process.start()

Output:

{'name_company': 'RHC Sukuk - S6', 'symbol': '1249'}
2022-06-16 19:40:58 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet>
{'name_company': 'Almarai Company Sukuk - Tranche 5', 'symbol': '6011'}
2022-06-16 19:40:58 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet>
{'name_company': 'STC  Sukuk', 'symbol': '7011'}
2022-06-16 19:40:58 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet>
{'name_company': 'BSF AT1 Sukuk', 'symbol': '1053'}
2022-06-16 19:40:58 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet>
{'name_company': 'AlBilad Sukuk 2', 'symbol': '1144'}
2022-06-16 19:40:58 [scrapy.core.engine] INFO: Closing spider (finished)
2022-06-16 19:40:58 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
 'downloader/response_status_count/200': 1,
 'elapsed_time_seconds': 21.667359,
 'item_scraped_count': 651,
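
If you only need the company list and don't want to run a full Scrapy process, the same servlet can be queried directly. Here is a minimal sketch with requests, assuming the endpoint keeps returning the same JSON fields (companyName, symbol):

import requests

url = 'https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet'
resp = requests.get(url, timeout=30)
resp.raise_for_status()

# Each entry in the JSON array describes one listed security
companies = [
    {'name_company': p['companyName'], 'symbol': p['symbol']}
    for p in resp.json()
]
print(len(companies), 'items')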