Web Scraping TimeOutException: Message:

Time: 08-27

I am trying to scrape an e-commerce website. I would like to get the product description of every product in the search results. I can successfully scrape all the product links from the search results and get the product description of a single product. However, when I loop over the product links to get the description of every product, a TimeoutException with an empty Message is raised.

I have already tried increasing the WebDriverWait timeout, but it does not fix the error.

Any idea what I should do?
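
I could catch the exception so the loop keeps going and at least see which link times out (a rough sketch that reuses the driver, wait, and class name from my code below), but I would like to fix the root cause:

for link in product_links:
    driver.get(link)
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))
    except TimeoutException:
        print('Timed out waiting for product page:', link)   # shows which link is failing
        continue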

Here is my code:

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options       # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC 
from time import sleep
from collections import Counter
import json
from turtle import delay
import time

# create object for chrome options
chrome_options = Options()

# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1365,4597.190")
chrome_options.add_argument('--disable-infobars')      

# create webdriver object
path = '/Applications/chromedriver'
webdriver_service = Service(path)
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)

baseurl = 'https://shopee.co.id/search?keyword=obat kanker'

product_links = []

for page in range(0,6):
    search_link = 'https://shopee.co.id/search?keyword=obat kanker&page={}'.format(page)
    driver.get(search_link)
    WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))

    driver.execute_script("""
            var scroll = document.body.scrollHeight / 10;
            var i = 0;
            function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
            }
            scrollit(i);
            """)
    sleep(5)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")

    product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
    for item in product_list:
        for link in item.find_all('a', href=True):
            product_links.append(baseurl + link['href'])



for link in product_links:
        driver.get(link)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))

        driver.execute_script("""
                var scroll = document.body.scrollHeight / 10;
                var i = 0;
                function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
                }
                scrollit(i);
                """)

        sleep(20)
        html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        soup = BeautifulSoup(html, "html.parser")

        name = soup.find('div', class_='_2rQP1z').text.replace('Star ','')
        price = soup.find('div', class_='_2Shl1j').text.replace('Rp','')
        sold = soup.find('div', class_ = 'HmRxgn').text.strip()
        rate = soup.find('div', class_ = '_3y5XOB _14izon').text.strip()
        city = soup.find('span', class_ = '_2fJrvA').text.strip()
        specification = soup.find('div', class_ = '_2jz573').text.strip()

        herbcancer = {
                'name': name,
                'price': price,
                'sold': sold,
                'rate': rate,
                'city': city,
                'specification': specification
        }

        print(herbcancer)

CodePudding user response:

The base URL is incorrect; that is why you get the TimeoutException:

https://shopee.co.id/search?keyword=obat kanker

The correct base URL is:

https://shopee.co.id
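
To see the difference, compare what the concatenation in the link-collection loop produces with each base (a minimal illustration; the relative href below is a made-up example of what the search result anchors contain):

base_wrong = 'https://shopee.co.id/search?keyword=obat kanker'
base_right = 'https://shopee.co.id'
href = '/Obat-Herbal-i.1234.5678'   # hypothetical relative href from a result card

print(base_wrong + href)   # .../search?keyword=obat kanker/Obat-Herbal-i.1234.5678 -> not a real page, so the wait times out
print(base_right + href)   # https://shopee.co.id/Obat-Herbal-i.1234.5678 -> valid product URL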

The complete code is:

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options       # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC 
from time import sleep
from collections import Counter
import json
from turtle import delay
import time

# create object for chrome options
chrome_options = Options()

# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1365,4597.190")
chrome_options.add_argument('--disable-infobars')      

# create webdriver object
path = ''
webdriver_service = Service(path)
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)

baseurl = 'https://shopee.co.id'

product_links = []

for page in range(0,6):
    search_link = 'https://shopee.co.id/search?keyword=obat kanker&page={}'.format(page)
    driver.get(search_link)
    WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))

    driver.execute_script("""
            var scroll = document.body.scrollHeight / 10;
            var i = 0;
            function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
            }
            scrollit(i);
            """)
    sleep(5)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")

    product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
    for item in product_list:
        for link in item.find_all('a', href=True):
            comp = baseurl + link['href']
            product_links.append(comp)
            
            
for link in product_links:
        driver.get(link)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))

        driver.execute_script("""
                var scroll = document.body.scrollHeight / 10;
                var i = 0;
                function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
                }
                scrollit(i);
                """)

        sleep(3)
        html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        soup = BeautifulSoup(html, "html.parser")

        name = soup.find('div', class_='_2rQP1z').text.replace('Star ','')
        price = soup.find('div', class_='_2Shl1j').text.replace('Rp','')
        sold = soup.find('div', class_ = 'HmRxgn').text.strip()
        rate = soup.find('div', class_ = '_3y5XOB _14izon').text.strip()
        
        try:
            city = soup.find('span', class_ = '_2fJrvA').text.strip()
        except:
            city=''
            
        try:
            specification = soup.find('div', class_ = '_2jz573').text.strip()
        except:
            specification=''

        herbcancer = {
                'name': name,
                'price': price,
                'sold': sold,
                'rate': rate,
                'city': city,
                'specification': specification
        }

        print(herbcancer)
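
Once the loop runs, a natural follow-up (a small sketch; the filename is just an example) is to collect every herbcancer dict into a list instead of only printing it, and write the list out with the json module that is already imported:

import json   # already imported at the top of the script

results = []
for link in product_links:
    # ... load the page and build the herbcancer dict exactly as above ...
    results.append(herbcancer)

with open('shopee_obat_kanker.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)   # ensure_ascii=False keeps the Indonesian text readable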