I'm trying to scrape a web page using the Selenium Chrome driver in headless mode, but it raises an error and is also very slow.
When I disable headless mode, it works well and is much faster!
My code:
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup, Tag
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
import ssl
import time
# Build the Chrome options: a 1920x1080 window, maximized, running headless.
chrome_options = Options()
for flag in ("--window-size=1920,1080", "--start-maximized"):
    chrome_options.add_argument(flag)
chrome_options.headless = True

# Start Chrome through the chromedriver binary stored at this local path.
driver = webdriver.Chrome(
    executable_path='/Users/sarathc/Desktop/costco/chromedriver',
    options=chrome_options,
)
def listResponse(url):
    """Open *url* in the shared ``driver`` and return the page as parsed HTML.

    The module-level Selenium ``driver`` navigates to ``url``; after a
    0.2 second pause, ``driver.page_source`` is parsed with BeautifulSoup's
    ``html.parser`` and the resulting soup object is returned.
    """
    driver.get(url)
    # Short fixed pause before the page source is read.
    time.sleep(0.2)
    html = driver.page_source
    return BeautifulSoup(html, "html.parser")
soup = listResponse("https://www.costco.com.au/Smart-TVs-Audio-Cameras/c/cos_21")
# Gather the <div> elements matched by the "category-node ng-star-inserted" class filter.
cat = soup.find_all("div", {"class": ["category-node ng-star-inserted"]})
for sk in cat:
    # find() gives None when the div contains no <a>; calling .get() on that
    # None is what raises the AttributeError reported below.
    link = sk.find("a")
    print(link.get("href"))
Error:
AttributeError: 'NoneType' object has no attribute 'get'
How can I run this code in headless mode without the error, and as fast as it runs without headless mode?
CodePudding user response:
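In some cases you need to set a User-Agent to get the full page source in headless mode: headless Chrome identifies itself as "HeadlessChrome" in its default User-Agent, and some sites respond to that with different or blocked content.
Code snippet: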
# ChromeDriverManager lives in the third-party ``webdriver_manager`` package.
# The snippet calls it, so it must be imported or the last line raises NameError.
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.headless = True
# Send an explicit User-Agent for the headless session.
chrome_options.add_argument("user-agent=Chrome/80.0.3987.132")
# Fixed window size; "--start-maximized" is not needed alongside it.
chrome_options.add_argument("--window-size=1920,1080")
# install() fetches a matching chromedriver if needed and returns its path.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
def listResponse(url):
    """Fetch *url* with the shared ``driver`` and return it as a BeautifulSoup tree.

    Navigates the module-level Selenium ``driver`` to ``url``, sleeps for
    0.2 seconds, and then parses ``driver.page_source`` using the
    ``html.parser`` backend.
    """
    driver.get(url)
    # Fixed 0.2 s delay before the page source is captured.
    time.sleep(0.2)
    page_html = driver.page_source
    return BeautifulSoup(page_html, "html.parser")
try:
    soup = listResponse("https://www.costco.com.au/Smart-TVs-Audio-Cameras/c/cos_21")
    # Category tiles are the <div> elements matched by this class filter.
    cat = soup.find_all("div", {"class": ["category-node ng-star-inserted"]})
    for sk in cat:
        link = sk.find("a")
        # find() returns None for a tile without an <a>; skip it rather than
        # fail with "'NoneType' object has no attribute 'get'".
        if link is None:
            continue
        print(link.get("href"))
finally:
    # Always shut the browser down so no headless Chrome process is left
    # running after the script ends, even if scraping raised.
    driver.quit()
Also, you don't need chrome_options.add_argument("--start-maximized")
when you've already specified the window size.