I have a website to scrape. The pages contain a list of companies spread across many pages (11). My script clicks on each company in the list and extracts information. The problem is that I only scrape the first page of the list; I want to browse all the pages to get all the information. This is my code:
import logging
import time
from datetime import datetime
from time import strftime, localtime
import scrapy
from scrapy import Selector
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from SaudiExchange.Config.AppSettings import AppSettings
from SaudiExchange.Repositories.AwsS3Repository import AwsS3Repository
class SaudiExchangeSpider(scrapy.Spider):
    name = 'saudi_exchange'
    logger = logging.getLogger("test_Launcher")
    settings = AppSettings(logger)
    awss3repository = AwsS3Repository(logger, settings)
    base_url = 'https://www.saudiexchange.sa'
    start_urls = ['https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory?'
                  'locale=en']
    today = f'output/Saudi_Exchange {strftime("%Y-%m-%d %H-%M-%S", localtime())}.json'
    if not settings.RUNNING_FROM_AWS:
        custom_settings = {
            'FEED_URI': 'Output/' + name + datetime.today().strftime('%y%m%d') + '.json',
            'FEED_FORMAT': 'json'
        }
    browser = webdriver.Chrome(executable_path=ChromeDriverManager().install())
    def selectCheckBox(self, checkboxValue):
        try:
            checkBox = self.browser.find_element(By.XPATH, f'//*[@value="{checkboxValue}"]')
            checkBox.click()
        except NoSuchElementException:
            time.sleep(1)
    def parse(self, no_response):
        self.browser.get(
            'https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory?')
        time.sleep(2)
        response = Selector(text=self.browser.page_source)
        all_listings = response.xpath('//*[@id="companiesListTable"]/tbody/tr/td[2]/a/@href').getall()
        for detail_page in all_listings:
            self.browser.get(no_response.urljoin(detail_page))
            time.sleep(2)
            self.browser.refresh()
            self.browser.find_element(By.XPATH, '//*[@id="statementsTab"]').click()
            time.sleep(2)
            self.browser.refresh()
            time.sleep(2)
            response = Selector(text=self.browser.page_source)
            Test_pdfs = response.css('[id="factSheetTable"] tbody tr')
            if Test_pdfs:
                time.sleep(1)
            else:
                self.browser.find_element(By.XPATH, '//*[@id="financialStatementsTab"]').click()
                time.sleep(5)
            self.browser.find_element(By.XPATH,
                                      '//*[@id="chart_sub_tab9"]/div/table/tbody/tr[1]/td[3]/a[1]').click()
            time.sleep(3)
            """filingInformationCheckBox = self.browser.find_element(By.XPATH, '//*[@value="FilingInformation"]')
            if filingInformationCheckBox is not None:
                filingInformationCheckBox.click()"""
            self.selectCheckBox("FilingInformation")
            self.selectCheckBox("StatementOfFinancialPositionCurrentNonCurrent")
            self.selectCheckBox("StatementOfIncomeFunctionOfExpense")
            self.selectCheckBox("StatementOfIncomeNatureOfExpense")
            self.selectCheckBox("StatementOfCashFlowsIndirectMethod")
            self.selectCheckBox("StatementOfCashFlowsIndirectMethodShareholdersOperations")
            self.selectCheckBox("StatementOfOtherComprehensiveIncomeBeforeTaxShareholdersOperations")
            self.selectCheckBox("StatementOfFinancialPositionOrderOfLiquidity")
            """
            self.browser.find_element(By.XPATH, '//*[@value="FilingInformation"]').click()
            self.browser.find_element(By.XPATH, '//*[contains(@value, "StatementOfCash")]').click()
            self.browser.find_element(By.XPATH, '//*[contains(@value, "StatementOfFinancial")]').click()
            self.browser.find_element(By.XPATH, '//*[contains(@value, "StatementOfIncome")]').click()"""
            time.sleep(2)
            self.browser.find_element(By.XPATH, '//*[@value="submit"]').click()
            response = Selector(text=self.browser.page_source)
            Results = list()
            for data in response.xpath('/html/body/div[4]/div/table/tbody[2]/tr'):
                key = data.css('td:nth-child(1) span::text').get()
                value = data.css('td:nth-child(2)::text').get()
                if key == " Name of reporting entity":
                    name_company = value
                if key == " Company symbol code| ISIN code":
                    symbol = value
                if key == " Description of presentation currency":
                    currency = value
                if key == " Level of rounding used in financial statements":
                    val = value
            single = {
                "name_company": name_company,
                "symbol": symbol,
                "Value_in": val,
                "Currency": currency
            }
            Results.append(single)
            Financial_Data = list()
            first_year1 = response.xpath('/html/body/div[6]/div/table/tbody[1]/tr[1]/th[2]/text()').get()
            second_year1 = response.xpath('/html/body/div[6]/div/table/tbody[1]/tr[1]/th[3]/text()').get()
            for data in response.xpath('/html/body/div[6]/div/table/tbody[2]/tr'):
                key = data.css('td:nth-child(1) span::text').get()
                val_1 = data.css('td:nth-child(2)::text').get()
                val_2 = data.css('td:nth-child(3)::text').get()
                if key == " Total assets" or key == " Total liabilities" or key == " Total equity" \
                        or key == " Total liabilities and equity":
                    single = {
                        "key": key.replace(' ', ''),
                        "Values": [
                            {"Key": first_year1[0:4], "Value": val_1},
                            {"Key": second_year1[0:4], "Value": val_2}
                        ]
                    }
                    Financial_Data.append(single)
            first_year2 = response.xpath('/html/body/div[7]/div/table/tbody[1]/tr[1]/th[2]/text()').get()
            second_year2 = response.xpath('/html/body/div[7]/div/table/tbody[1]/tr[1]/th[3]/text()').get()
            for data in response.xpath('/html/body/div[7]/div/table/tbody[2]/tr'):
                key = data.css('td:nth-child(1) span::text').get()
                val_1 = data.css('td:nth-child(2)::text').get()
                val_2 = data.css('td:nth-child(3)::text').get()
                if key == " Profit (loss), attributable to equity holders of parent company":
                    key = 'Net Income'
                    single = {
                        "key": key.replace(' ', ''),
                        "Values": [
                            {"Key": first_year2[0:4], "Value": val_1},
                            {"Key": second_year2[0:4], "Value": val_2}
                        ]
                    }
                    Financial_Data.append(single)
            first_year3 = response.xpath('/html/body/div[10]/div/table/tbody[1]/tr[1]/th[2]/text()').get()
            second_year3 = response.xpath('/html/body/div[10]/div/table/tbody[1]/tr[1]/th[3]/text()').get()
            for data in response.xpath('/html/body/div[10]/div/table/tbody[2]/tr'):
                key = data.css('td:nth-child(1) span::text').get()
                val_1 = data.css('td:nth-child(2)::text').get()
                val_2 = data.css('td:nth-child(3)::text').get()
                if key == " Net cash flows from (used in) operating activities" \
                        or key == " Net cash flows from (used in) investing activities" \
                        or key == " Net cash flows from (used in) financing activities" \
                        or key == " Net increase (decrease) in cash and cash equivalents":
                    single = {
                        "key": key.replace(' ', ''),
                        "Values": [
                            {"Key": first_year3[0:4], "Value": val_1},
                            {"Key": second_year3[0:4], "Value": val_2}
                        ]
                    }
                    Financial_Data.append(single)
            yield {
                "Results": Results,
                "Financial Data": Financial_Data
            }
        element = self.browser.find_element(By.CSS_SELECTOR, "a#pageing_next")
        element.click()
When I run the code I get this error:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"a#pageing_next"}
CodePudding user response:
If you inspect the a element with id pageing_next in the browser dev tools, you will see there is no real pagination and no href attribute. It's just a dummy element that is populated by a script element. This means you need to iterate over the rows and columns of the table itself to extract the entire list, as in the sketch below.
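For example, here is a minimal sketch of that idea (assuming the listing table keeps the companiesListTable id used in the question's XPath, and that the script-driven pager only hides rows client-side, so every row is already in the DOM):

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory?locale=en')
time.sleep(5)  # crude wait for the script-rendered table, mirroring the question's approach

# Walk every row of the companies table instead of clicking the
# non-functional a#pageing_next anchor.
for row in browser.find_elements(By.XPATH, '//*[@id="companiesListTable"]/tbody/tr'):
    cells = row.find_elements(By.TAG_NAME, 'td')
    # The question's code takes the company link from the second column (td[2])
    link = cells[1].find_element(By.TAG_NAME, 'a').get_attribute('href')
    print(cells[1].text, link)

browser.quit()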
Here are two links to understand this: how you can handle a web table in Selenium, and Handling dynamic web tables in Selenium.
Hope this helps...
CodePudding user response:
Actually, the data is also generated from an API call that returns a JSON response. An example is given below:
from scrapy.crawler import CrawlerProcess
import scrapy


class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        url = 'https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet'
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            method="GET"
        )

    def parse(self, response):
        # The servlet returns a JSON list of companies
        data = response.json()
        for p in data:
            item = dict()
            item['name_company'] = p["companyName"]
            item['symbol'] = p["symbol"]
            yield item


if __name__ == "__main__":
    # CrawlerProcess takes settings; the spider class is passed to crawl()
    process = CrawlerProcess()
    process.crawl(TestSpider)
    process.start()
Output:
{'name_company': 'RHC Sukuk - S6', 'symbol': '1249'}
2022-06-16 19:40:58 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet>
{'name_company': 'Almarai Company Sukuk - Tranche 5', 'symbol': '6011'}
2022-06-16 19:40:58 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet>
{'name_company': 'STC Sukuk', 'symbol': '7011'}
2022-06-16 19:40:58 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet>
{'name_company': 'BSF AT1 Sukuk', 'symbol': '1053'}
2022-06-16 19:40:58 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.saudiexchange.sa/tadawul.eportal.theme.helper/ThemeSearchUtilityServlet>
{'name_company': 'AlBilad Sukuk 2', 'symbol': '1144'}
2022-06-16 19:40:58 [scrapy.core.engine] INFO: Closing spider (finished)
2022-06-16 19:40:58 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 21.667359,
'item_scraped_count': 651,