Scrape the number of comments and number of pictures from AliExpress using Selenium


I'm trying to scrape product data from the AliExpress website using Selenium. I have already scraped the title, price and rating for a whole category, but I don't know how to scrape the number of comments and the number of pictures of each product in the "smart lock" category.

This is the code I used to scrape the title, price and rating from the URL of the smart lock products:

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect  # must be installed for lxml's .cssselect() to work
import time


def scrap(subject):
    start_time = time.time()      
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors-spki-list')
    options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome('C:/Users/ADMIN/Desktop/chromedriver.exe', options=options)  # "chrome_options" is deprecated; use "options"
    # url.format(page_nb) below needs a {} placeholder to actually switch pages;
    # "page" is the query parameter the site appears to use
    url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=smart lock&ltype=wholesale&SortType=total_tranpro_desc&page={}'
    
    client = MongoClient("mongodb://localhost:27017/")
    collection_name = subject
    collection = client["db2"][collection_name]
    collection.delete_many({})   # clear any previous run
    for page_nb in range(1, 6):
        
        print('---', page_nb, '---')    
        wait = WebDriverWait(driver, 10)
        actions = ActionChains(driver)
        driver.get(url.format(page_nb))           
        sleep(2)
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # give JavaScript time to add lazy-loaded elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            if new_offset <= current_offset:
                break
            current_offset = new_offset
        sleep(3)
        tree = html.fromstring(driver.page_source)
        results = []
        # note: the class attribute values in the XPath/CSS selectors below were
        # stripped when the question was posted ("[@]"); restore them from your
        # own inspection of the page
        for product in tree.xpath('//div[@]//a'):
            title = product.xpath('.//h1/text()')
            if title:
                title = title[0]
                # the price is split across several <span> elements;
                # the last one holds the currency
                price = product.cssselect('div.mGXnE._37W_B span')
                leng = len(price)
                price = [x.text for x in price]

                currency = price[leng - 1]

                price = ''.join(price[0:leng - 1])

                stars = product.xpath('.//span[@]/text()')
                if stars:
                    stars = stars[0]
                else:
                    stars = 'None'

                row = [title, price, stars]
                results.append(row)
      
        df = pd.DataFrame(results, columns=("Title", "Price", "Stars"))

        data = df.to_dict(orient='records')
        collection.insert_many(data)

For example, I want to scrape these two elements:

(screenshot: a product page's feedback section showing the total number of reviews and the number of reviews with pictures)

CodePudding user response:

As mentioned in my comment above, try to scrape all the product links on the page first:

urlList = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR,'a[href*="/item/"]')]
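
Listing pages usually render more than one anchor per product card (image, title, etc.), so this list can contain duplicates. An optional clean-up step (a sketch, assuming the duplicates only differ by query string) that drops tracking parameters and deduplicates while preserving order:

# optional: strip query strings and deduplicate, preserving order
urlList = list(dict.fromkeys(u.split('?')[0] for u in urlList))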

Then iterate over this list and open each detail page:

for url in urlList[:2]:
    driver.get(url)

The tricky part: you have to scroll down until you detect the iframe that holds the feedback. Extract its src and scrape that resource:

...
try:
    iframe = driver.find_element(By.CSS_SELECTOR, '#feedback iframe').get_attribute('src')
except NoSuchElementException:   # narrower than a bare except
    iframe = None
if new_offset <= current_offset or iframe:
    break
current_offset = new_offset

driver.get(iframe)
# total number of feedback entries ("transction" is the id as it appears on the site)
print(int(''.join(i for i in driver.find_element(By.CSS_SELECTOR,'#transction-feedback > div').text if i.isdigit())))
# count shown next to the "with pictures" filter
print(driver.find_element(By.CSS_SELECTOR,'#cb-withPictures-filter  em').text)
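
Alternatively, if the #feedback container is already in the DOM before its iframe is injected (an assumption worth verifying on the page), you can scroll it into view and let WebDriverWait poll for the iframe instead of hand-rolling the scroll loop:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# scroll the feedback section into view so the lazy-loaded iframe gets injected
feedback = driver.find_element(By.CSS_SELECTOR, '#feedback')
driver.execute_script("arguments[0].scrollIntoView();", feedback)

# wait up to 10 seconds for the iframe to appear, then grab its src
iframe = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#feedback iframe'))
).get_attribute('src')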

Example

Note: For the demo I sliced urlList[:2] to get only two iterations, but you can simply delete [:2] to process all URLs in the list.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=smart lock&ltype=wholesale&SortType=total_tranpro_desc'
wait = WebDriverWait(driver, 10)
actions = ActionChains(driver)

driver.get(url)

urlList = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR,'a[href*="/item/"]')]

for url in urlList[:2]:
    driver.get(url)
    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give JavaScript time to add lazy-loaded elements
        new_offset = driver.execute_script("return window.pageYOffset;")

        try:
            iframe = driver.find_element(By.CSS_SELECTOR, '#feedback iframe').get_attribute('src')
        except NoSuchElementException:
            iframe = None
        if new_offset <= current_offset or iframe:
            break
        current_offset = new_offset

    if iframe is None:   # no feedback iframe found; skip this product
        continue
    driver.get(iframe)
    # total number of feedback entries ("transction" is the id as it appears on the site)
    print(int(''.join(i for i in driver.find_element(By.CSS_SELECTOR,'#transction-feedback > div').text if i.isdigit())))
    # count shown next to the "with pictures" filter
    print(driver.find_element(By.CSS_SELECTOR,'#cb-withPictures-filter  em').text)
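
To feed these counts back into the pipeline from the question, one option is to wrap the detection logic in a helper and insert the results into MongoDB alongside the other fields. A rough sketch building on the code above (get_feedback_counts and the parenthesis-stripping are my assumptions, not tested against the live page):

from pymongo import MongoClient

def get_feedback_counts(driver, url):
    # open a product page, scroll until the feedback iframe appears,
    # then return (comments, pictures), or None if no iframe was found
    driver.get(url)
    current_offset = 0
    iframe = None
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)
        new_offset = driver.execute_script("return window.pageYOffset;")
        try:
            iframe = driver.find_element(By.CSS_SELECTOR, '#feedback iframe').get_attribute('src')
        except NoSuchElementException:
            iframe = None
        if new_offset <= current_offset or iframe:
            break
        current_offset = new_offset
    if not iframe:
        return None
    driver.get(iframe)
    comments = int(''.join(i for i in driver.find_element(By.CSS_SELECTOR, '#transction-feedback > div').text if i.isdigit()))
    # the "with pictures" count may be wrapped in parentheses, e.g. "(123)" (an assumption)
    pictures = int(driver.find_element(By.CSS_SELECTOR, '#cb-withPictures-filter  em').text.strip('() '))
    return comments, pictures

rows = []
for url in urlList[:2]:
    counts = get_feedback_counts(driver, url)
    if counts:
        rows.append({'Url': url, 'Comments': counts[0], 'Pictures': counts[1]})

if rows:
    MongoClient("mongodb://localhost:27017/")["db2"]["smart lock"].insert_many(rows)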