I am trying to use a rotating proxy in this script, but I am not sure how to set it up correctly. I have read the previous questions on this topic and tried to implement their suggestions, but the site still detects the proxy, asks for a login, and prevents me from getting any data. I developed the script below using Selenium with selenium-stealth. I also tried a CrawlSpider, but got the same result.
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium_stealth import stealth
import time
class RsSpider(scrapy.Spider):
    """Scrape listing detail pages from sahibinden.com.

    The search-results page is opened in a stealth-patched Chrome session to
    collect the listing links; each link is then scheduled as a
    ``SeleniumRequest`` and handled by :meth:`parse`.
    """

    name = 'rs'
    allowed_domains = ['www.sahibinden.com']

    def start_requests(self):
        """Yield one ``SeleniumRequest`` per listing link found on the results page."""
        options = webdriver.ChromeOptions()
        options.add_argument("start-maximized")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)

        # The options must be handed to the driver; otherwise the switches
        # configured above are never applied to the browser.
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )
        try:
            driver.set_window_size(1920, 1080)
            stealth(
                driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
            )
            driver.get("https://www.sahibinden.com/satilik/istanbul-eyupsultan?pagingOffset=0")
            # Give the page time to render before querying the result rows.
            time.sleep(5)
            links = driver.find_elements(By.XPATH, "//td[@class='searchResultsTitleValue ']/a")
            # Read the hrefs while the browser is still alive; the elements
            # become unusable once the driver has quit.
            hrefs = [link.get_attribute('href') for link in links]
        finally:
            # Always release the browser, even if loading or parsing failed.
            driver.quit()

        for href in hrefs:
            if not href:
                # Anchors without an href cannot be requested.
                continue
            yield SeleniumRequest(
                url=href,
                callback=self.parse,
                # Scrapy expects a full proxy URL including the scheme.
                # NOTE(review): scrapy-selenium loads the page through its own
                # browser, so this meta key is presumably not applied to that
                # traffic - confirm, and configure the proxy on the browser
                # itself if it is not.
                meta={'proxy': 'http://username:password@server:2000'},
                wait_time=1,
            )

    def parse(self, response):
        """Yield the URL and city of a single listing page."""
        # NOTE(review): the class value was split across two lines in the
        # original; it is restored here with a trailing space, matching the
        # style of the results-page selector - confirm against the live page.
        yield {
            'URL': response.url,
            'City': response.xpath(
                "normalize-space(//div[@class='classifiedInfo ']/h2/a[1]/text())"
            ).get(),
        }
CodePudding user response:
If adding the proxy to the request parameters does not work, then:
#1
You can add a proxy downloader middleware and enable it in the project settings (the better, safer option).
Here is working code for the middleware:
from w3lib.http import basic_auth_header
from scrapy.utils.project import get_project_settings
class ProxyMiddleware(object):
    """Downloader middleware that sends every request through one proxy.

    The proxy address and credentials are read from the project settings
    ``PROXY_HOST``, ``PROXY_PORT``, ``PROXY_USER`` and ``PROXY_PASSWORD``.
    """

    def process_request(self, request, spider):
        """Attach the proxy URL and the ``Proxy-Authorization`` header to ``request``.

        Nothing is returned, so the request carries on through the remaining
        downloader middlewares.
        """
        settings = get_project_settings()

        # The "+" operators were missing in the original expression.
        proxy = settings.get('PROXY_HOST') + ':' + settings.get('PROXY_PORT')
        if '://' not in proxy:
            # Scrapy expects a full proxy URL; default to plain HTTP when the
            # configured host does not carry a scheme of its own.
            proxy = 'http://' + proxy
        request.meta['proxy'] = proxy

        request.headers["Proxy-Authorization"] = basic_auth_header(
            settings.get('PROXY_USER'),
            settings.get('PROXY_PASSWORD'),
        )
        spider.log('Proxy : %s' % request.meta['proxy'])
Settings file (activate the middleware via DOWNLOADER_MIDDLEWARES):
import os
from dotenv import load_dotenv
load_dotenv()
....
....
# Proxy connection details, read from the process environment.
# Any variable that is not set in the environment resolves to None.
PROXY_HOST = os.getenv("PROXY_HOST")
PROXY_PORT = os.getenv("PROXY_PORT")
PROXY_USER = os.getenv("PROXY_USER")
PROXY_PASSWORD = os.getenv("PROXY_PASSWORD")
.....
.....
.....
# Downloader middlewares enabled for this project, keyed by import path.
# The value is the order Scrapy uses to sort the middleware chain.
DOWNLOADER_MIDDLEWARES = {
    # Project default middleware, left disabled:
    # 'project.middlewares.projectDownloaderMiddleware': 543,
    "project.proxy_middlewares.ProxyMiddleware": 350,
}
.env file:
PROXY_HOST=127.0.0.1
PROXY_PORT=6666
PROXY_USER=proxy_user
PROXY_PASSWORD=proxy_password
#2
Alternatively, have a look at the scrapy-rotating-proxies middleware.