How to switch from selenium to scrapy


I have created a scraping tool using Python Selenium to collect data from the AliExpress website. The issue is that the code takes a long time to execute. I have tried several tips to optimize Selenium, but in vain, so I need to switch to Scrapy; however, I am pretty new to it. Would you please help me find a method to switch from Selenium to Scrapy without starting from the beginning?

The script:

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect  # required by lxml's .cssselect() below


####################################### Scrape all product pages ##################################
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress\msedgedriver") 
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

results = []

for page_nb in range(1, 2):
    print('---', page_nb, '---')
    driver.get(url.format(page_nb))
    sleep(2)
    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # JavaScript has time to add elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        print(new_offset,current_offset)
        if new_offset <= current_offset:
            break
        current_offset = new_offset
    
    sleep(3)
    
    tree = html.fromstring(driver.page_source)

    # The container class was lost from the original post; "_3GR-w" is the
    # product-card class used in the second answer and may need updating.
    for product in tree.xpath('//div[@class="_3GR-w"]//a'):
        title = product.xpath('.//h1/text()')
        
        if title:
            title = title[0]
            
            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]

            currency = price[0]
            price = ''.join(price[1:])

            # "eXPaM" is the star-rating class used in the second answer.
            stars = product.xpath('.//span[@class="eXPaM"]/text()')
            if stars:
                stars = stars[0]
            else:
                stars = 'None'
                
            # The sold-count class was lost from the original post; restore it from the live page.
            nb_sold = product.xpath('.//span[@class="..."]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = 'None'
            
            # The supplier-link class was lost from the original post.
            supl = product.xpath('.//a[@class="..."]/text()')
            if supl:
                supl = supl[0]
            else:
                supl = 'None'

            # "_2jcMA" is the shipping-cost class used in the second answer.
            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = 'None'
            
            product_links = product.xpath('./@href')
            if product_links:
                product_links = baseurl + str(product_links[0])
            
            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)
            print('len(results):', len(results))

driver.close()

df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks"))

################################ Insert in database ################################
client = MongoClient("mongodb://localhost:27017/")     
collection = client['db2']['aliex2']     
data = df.to_dict(orient='records')
collection.insert_many(data) 

################################### Get the list of documents ##################################
products = list(collection.find({}))

################################ Get list of urls #################################
for product in products:
    url = product["Productlinks"]
    driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress\msedgedriver") 
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    # The class names below were lost from the original post; fill them in from the live page.
    for a in driver.find_elements(By.XPATH, '//div[@class="..."]'):
        credibility = a.find_element(By.XPATH, './/div[@class="..."]').text
        print(credibility)
    sleep(.5)   
    driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
    driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
    
    feedback=[]
    while True:

        for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):

            try:
                country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
            except:
                country = None

            try:
                comment = e.find_element(By.CSS_SELECTOR, '.buyer-feedback span').text
            except:
                comment = None
            try:
                images = [i.get_attribute('src') for i in e.find_elements(By.CSS_SELECTOR, '.r-photo-list img')]
            except:
                images = []


            feedback.append({
                'country': country,
                'comment': comment,
                'images': images,
            })
        try:
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
        except:
            break
    driver.close()
    collection.find_one_and_update(
        {"_id": product["_id"]},
        {"$set": {"Feedback": feedback}},
        upsert=False,
        array_filters=None,
    )
    

I would be grateful if you could help me. Thank you!

CodePudding user response:

You can start scraping from the homepage of the website. With Scrapy you can download the home page content into a Python variable, use the BeautifulSoup library to clean and organize the downloaded content, and then use ordinary Python functions to pull the relevant information out of it. To collect all the required URLs present on the home page, you can use regular expressions and pattern matching, or a link extractor: from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor (the old scrapy.contrib path has been removed in recent Scrapy versions).
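
To make this concrete, here is a minimal sketch of the product-list step as a Scrapy spider. It is an illustration, not a drop-in replacement: the CSS class names (_3GR-w, eXPaM, _2jcMA) are taken from the second answer below and will likely have changed, and AliExpress builds its result grid with JavaScript, so a plain Scrapy fetch may come back empty unless you add a rendering layer (see the scrapy-selenium variant at the end of this page).

import scrapy


class AliexpressSpider(scrapy.Spider):
    name = 'aliexpress'
    start_urls = [
        'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y'
        '&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale'
        '&SortType=default&page=1'
    ]

    def parse(self, response):
        # One dict per product card; the selectors mirror the bs4 version
        # below and must be verified against the live page.
        for card in response.css('div._3GR-w'):
            yield {
                'Title': card.css('h1::text').get(),
                'Stars': card.css('span.eXPaM::text').get(),
                'Shipcost': card.css('span._2jcMA::text').get(),
                'Productlinks': response.urljoin(
                    card.css('a::attr(href)').get(default='')),
            }

Save it as aliexpress_spider.py and run: scrapy runspider aliexpress_spider.py -o products.json. The MongoDB insert from your script can then move into a Scrapy item pipeline instead of going through a pandas DataFrame.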

CodePudding user response:

As the page is dynamic, you can render it with Selenium and then grab the data using bs4; a Scrapy version of the same idea is sketched after the output below.

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page=1'     

driver = webdriver.Chrome(ChromeDriverManager().install())
                            
driver.get(url)
driver.maximize_window()

soup = BeautifulSoup(driver.page_source,'html.parser')
driver.close()

for card in soup.find_all('div', class_="_3GR-w"):
    title = card.select_one('._1tu1Z.Vgu6S h1').get_text()
    print(title)
    star_rating = card.find('span', class_="eXPaM").get_text() if card.find('span', class_="eXPaM") else None
    print(star_rating)
    ship_cost = card.find('span', class_="_2jcMA").get_text() if card.find('span', class_="_2jcMA") else None
    print(ship_cost)

Output:

M9 TWS Gaming Earbuds Wireless Bluetooth Headphones 2000mAh Long Battery Life Earphones Smart 
Touch Headsets for iPhone Xiaomi
5
Free Shipping
Xiaomi E7S Bluetooth Earbuds Audiophile Headphones Stereo  Sports Listening To Music Gaming Volume Adjustment Ear Buds

... so on
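
Since the question was about moving to Scrapy, note that this rendering step can also live inside a Scrapy spider through the scrapy-selenium middleware, which keeps the Selenium rendering while Scrapy handles scheduling and output. Below is a minimal sketch, assuming the scrapy-selenium package is installed and configured; the CSS classes are the same unverified ones used above.

import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Required settings (settings.py), per the scrapy-selenium README:
#   SELENIUM_DRIVER_NAME = 'chrome'
#   SELENIUM_DRIVER_EXECUTABLE_PATH = '/path/to/chromedriver'
#   SELENIUM_DRIVER_ARGUMENTS = ['--headless']
#   DOWNLOADER_MIDDLEWARES = {'scrapy_selenium.SeleniumMiddleware': 800}


class AliexpressRenderedSpider(scrapy.Spider):
    name = 'aliexpress_rendered'

    def start_requests(self):
        url = ('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y'
               '&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale'
               '&SortType=default&page=1')
        # Render in a real browser and wait up to 10 s for the product grid.
        yield SeleniumRequest(
            url=url,
            callback=self.parse,
            wait_time=10,
            wait_until=EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div._3GR-w')),
        )

    def parse(self, response):
        for card in response.css('div._3GR-w'):
            yield {
                'Title': card.css('h1::text').get(),
                'Stars': card.css('span.eXPaM::text').get(),
                'Shipcost': card.css('span._2jcMA::text').get(),
            }

This does not reproduce the scroll-to-bottom loop from the original script; SeleniumRequest also accepts a script argument for running JavaScript (e.g. scrolling) before the response is handed back to Scrapy.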
