The request works with plain Selenium, but not with scrapy-selenium. The page loads and I get a 200 response from the website; no error is raised, but the spider does not yield any output.
class SeamdbTestSpider(scrapy.Spider):
    """Spider that requests the SteamDB graph page through scrapy-selenium.

    Every URL in ``start_urls`` is requested with a ``SeleniumRequest`` and
    the resulting page is searched for rows of the ``table-apps`` table.
    """

    name = 'steam_db_test'
    start_urls = ['https://steamdb.info/graph/']

    def start_requests(self):
        """Yield one ``SeleniumRequest`` per start URL, handled by ``parse``."""
        for link in self.start_urls:
            yield SeleniumRequest(
                url=link,
                wait_time=10,
                callback=self.parse,
            )

    def parse(self, response):
        """Yield a ``{"Name": ..., "Link": ...}`` dict for the first two app rows.

        The markup is read from the driver stored in ``response.meta`` rather
        than from the response body.
        """
        driver = response.meta['driver']
        initial_page = driver.page_source
        r = Selector(text=initial_page)
        table = r.xpath('//*[@id="table-apps"]/tbody')
        # Only the first two matching rows are processed.
        rows = table.css('tr[class= "app"]')[0:2]
        for element in rows:
            # The extracted href is appended to the site root.
            info_link = "https://steamdb.info" + element.css('::attr(href)').get()
            name = element.css('a ::text').get()
            yield {"Name": name, "Link": info_link}
CodePudding user response:
Actually, SeleniumRequest with Scrapy does not always work reliably. The same element selection works when using Selenium with bs4, but with Scrapy and SeleniumRequest I get empty output, just like you.
Scrapy-SeleniumRequest not working
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest
class SeamdbTestSpider(scrapy.Spider):
    """Spider that requests the SteamDB graph page through scrapy-selenium.

    Every URL in ``start_urls`` is requested with a ``SeleniumRequest`` and
    each row of the ``table-apps`` table is turned into a name/link item.
    """

    name = 'steam_db_test'
    start_urls = ['https://steamdb.info/graph/']

    def start_requests(self):
        """Yield one ``SeleniumRequest`` per start URL, handled by ``parse``."""
        for link in self.start_urls:
            yield SeleniumRequest(
                url=link,
                wait_time=10,
                callback=self.parse,
            )

    def parse(self, response):
        """Yield a ``{"Name": ..., "Link": ...}`` dict for every table row with a link.

        The markup is read from the driver stored in ``response.meta`` rather
        than from the response body. Rows whose third cell has no anchor
        ``href`` are skipped.
        """
        driver = response.meta['driver']
        initial_page = driver.page_source
        r = Selector(text=initial_page)
        rows = r.css('table#table-apps tbody tr')
        for element in rows:
            href = element.css('td:nth-child(3) > a::attr(href)').get()
            if href is None:
                # .get() returns None when nothing matches; concatenating
                # it onto the site root would raise TypeError.
                continue
            info_link = "https://steamdb.info" + href
            name = element.css('td:nth-child(3) > a::text').get()
            yield {"Name": name, "Link": info_link}
Selenium with bs4 is working fine:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
# Open the SteamDB graph page in Chrome and print the link and name found in
# the third cell of every row of the "table-apps" table.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# Keep Chrome open.
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get("https://steamdb.info/graph/")
# Fixed pause before the page source is read.
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')
for tr in soup.select('table#table-apps tbody tr'):
    # Look the anchor up once and reuse it for both the href and the text.
    anchor = tr.select_one('td:nth-child(3) > a')
    if anchor is None:
        # select_one returns None when the row has no matching anchor.
        continue
    link = "https://steamdb.info" + anchor.get('href')
    name = anchor.text
    print(link)
    print(name)
Output:
https://steamdb.info/app/730/graphs/
Counter-Strike: Global Offensive
https://steamdb.info/app/570/graphs/
Dota 2
https://steamdb.info/app/578080/graphs/
PUBG: BATTLEGROUNDS
https://steamdb.info/app/1172470/graphs/
Apex Legends
https://steamdb.info/app/1599340/graphs/
Lost Ark
https://steamdb.info/app/271590/graphs/
Grand Theft Auto V
https://steamdb.info/app/440/graphs/
Team Fortress 2
https://steamdb.info/app/1446780/graphs/
MONSTER HUNTER RISE
https://steamdb.info/app/346110/graphs/
ARK: Survival Evolved
https://steamdb.info/app/252490/graphs/
Rust
https://steamdb.info/app/431960/graphs/
Wallpaper Engine
https://steamdb.info/app/1506830/graphs/
FIFA 22
https://steamdb.info/app/1085660/graphs/
Destiny 2
https://steamdb.info/app/1569040/graphs/
Football Manager 2022
https://steamdb.info/app/230410/graphs/
Warframe
https://steamdb.info/app/1203220/graphs/
NARAKA: BLADEPOINT
https://steamdb.info/app/359550/graphs/
Tom Clancy's Rainbow Six Siege
https://steamdb.info/app/381210/graphs/
Dead by Daylight
https://steamdb.info/app/236390/graphs/
.. so on