I am trying to scrape an e-commerce website. I want to scrape the product description of every product in the search results. I can successfully scrape all the product links from the search results and get the product description of a single product. However, when I loop over the product links to get the description of every product, a TimeoutException is raised.
I have already tried increasing the WebDriverWait timeout, and it does not fix the error.
Any idea what I should do?
Here is my code:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options  # to customize Chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep

# create object for Chrome options
chrome_options = Options()
# customize Chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-notifications')
chrome_options.add_argument('window-size=1365,4597')
chrome_options.add_argument('--disable-infobars')

# create webdriver object (Selenium 4 expects the Service, not executable_path)
path = '/Applications/chromedriver'
webdriver_service = Service(path)
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
baseurl = 'https://shopee.co.id/search?keyword=obat kanker'
product_links = []

for page in range(0, 6):
    search_link = 'https://shopee.co.id/search?keyword=obat kanker&page={}'.format(page)
    driver.get(search_link)
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item"))
    )
    # scroll down in ten steps so the lazy-loaded result cards render
    driver.execute_script("""
        var scroll = document.body.scrollHeight / 10;
        var i = 0;
        function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
            }
        }
        scrollit(i);
    """)
    sleep(5)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")
    product_list = soup.find_all('div', class_='col-xs-2-4 shopee-search-item-result__item')
    for item in product_list:
        for link in item.find_all('a', href=True):
            product_links.append(baseurl + link['href'])
for link in product_links:
    driver.get(link)
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J"))
    )
    # scroll the product page the same way so all sections load
    driver.execute_script("""
        var scroll = document.body.scrollHeight / 10;
        var i = 0;
        function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
            }
        }
        scrollit(i);
    """)
    sleep(20)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")
    name = soup.find('div', class_='_2rQP1z').text.replace('Star ', '')
    price = soup.find('div', class_='_2Shl1j').text.replace('Rp', '')
    sold = soup.find('div', class_='HmRxgn').text.strip()
    rate = soup.find('div', class_='_3y5XOB _14izon').text.strip()
    city = soup.find('span', class_='_2fJrvA').text.strip()
    specification = soup.find('div', class_='_2jz573').text.strip()
    herbcancer = {
        'name': name,
        'price': price,
        'sold': sold,
        'rate': rate,
        'city': city,
        'specification': specification
    }
    print(herbcancer)
CodePudding user response:
Your base URL is incorrect, which is why you get the TimeoutException. You are using:
https://shopee.co.id/search?keyword=obat kanker
The correct base URL is:
https://shopee.co.id
The hrefs on the result cards are site-relative paths, so prepending the full search URL produces links that do not resolve to a product page; the wait for the product element then times out.
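As a quick illustration of the fix (the href value below is hypothetical), urllib.parse.urljoin joins a relative href against the site origin, whereas concatenating it onto the search URL yields an address the site cannot serve:

from urllib.parse import urljoin

search_url = 'https://shopee.co.id/search?keyword=obat kanker'
origin = 'https://shopee.co.id'
href = '/obat-herbal-i.1234.5678'  # hypothetical relative href from a result card

# concatenating against the search URL gives an invalid product link:
# https://shopee.co.id/search?keyword=obat kanker/obat-herbal-i.1234.5678
print(search_url + href)

# joining against the origin gives a real product URL:
# https://shopee.co.id/obat-herbal-i.1234.5678
print(urljoin(origin, href))

urljoin also does the right thing when the base URL still carries a path or query string, so it is a safer default than plain string concatenation.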
The complete code is:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options  # to customize Chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep

# create object for Chrome options
chrome_options = Options()
# customize Chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-notifications')
chrome_options.add_argument('window-size=1365,4597')
chrome_options.add_argument('--disable-infobars')

# create webdriver object (Selenium 4 expects the Service, not executable_path)
path = ''  # path to your chromedriver binary
webdriver_service = Service(path)
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
baseurl = 'https://shopee.co.id'
product_links = []

for page in range(0, 6):
    search_link = 'https://shopee.co.id/search?keyword=obat kanker&page={}'.format(page)
    driver.get(search_link)
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item"))
    )
    # scroll down in ten steps so the lazy-loaded result cards render
    driver.execute_script("""
        var scroll = document.body.scrollHeight / 10;
        var i = 0;
        function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
            }
        }
        scrollit(i);
    """)
    sleep(5)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")
    product_list = soup.find_all('div', class_='col-xs-2-4 shopee-search-item-result__item')
    for item in product_list:
        for link in item.find_all('a', href=True):
            comp = baseurl + link['href']
            product_links.append(comp)
for link in product_links:
    driver.get(link)
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J"))
    )
    # scroll the product page the same way so all sections load
    driver.execute_script("""
        var scroll = document.body.scrollHeight / 10;
        var i = 0;
        function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
            }
        }
        scrollit(i);
    """)
    sleep(3)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")
    name = soup.find('div', class_='_2rQP1z').text.replace('Star ', '')
    price = soup.find('div', class_='_2Shl1j').text.replace('Rp', '')
    sold = soup.find('div', class_='HmRxgn').text.strip()
    rate = soup.find('div', class_='_3y5XOB _14izon').text.strip()
    # city and specification are missing on some listings, so guard them
    try:
        city = soup.find('span', class_='_2fJrvA').text.strip()
    except AttributeError:
        city = ''
    try:
        specification = soup.find('div', class_='_2jz573').text.strip()
    except AttributeError:
        specification = ''
    herbcancer = {
        'name': name,
        'price': price,
        'sold': sold,
        'rate': rate,
        'city': city,
        'specification': specification
    }
    print(herbcancer)
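One optional hardening step, beyond what the fix above requires: if a single product page never renders the _2VZg1J container in time, the loop still dies with TimeoutException. A minimal sketch that skips such pages and persists what it does collect (the scrape_all helper and the products.json filename are hypothetical names of mine; the Selenium and BeautifulSoup imports are reused from the script above):

import json

def scrape_all(driver, product_links, out_path='products.json'):
    """Visit each product link; skip pages whose product container never
    appears instead of letting TimeoutException abort the whole run."""
    results = []
    for link in product_links:
        try:
            driver.get(link)
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J"))
            )
        except TimeoutException:
            print('timed out, skipping:', link)
            continue
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        name_div = soup.find('div', class_='_2rQP1z')
        results.append({
            'url': link,
            'name': name_div.text.replace('Star ', '') if name_div else '',
        })
    # persist whatever was collected, even if some pages were skipped
    with open(out_path, 'w') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    return results

The same try/except could wrap the full field extraction from the loop above; the point is only that one bad page should cost you one record, not the whole crawl.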