Home > Net >  I can't get reviews of each product page in Aliexpress
I can't get reviews of each product page in Aliexpress

Time:03-15

I'm trying to scrape data from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=AS_20220313071939&SearchText=bluetooth earphones. In particular, I want to get all the reviews from each product page. The main issue is that I'm struggling to reach the highlighted section at the bottom of the page in order to scrape each comment and the customer's country. Here is a photo showing that: enter image description here

This is my code:

from selenium import webdriver
from lxml import html
import cssselect  # required so lxml's .cssselect() works
from time import sleep
from itertools import zip_longest
import csv

driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'


def scroll_to_bottom(drv):
    """Scroll one viewport at a time so lazy-loaded items get rendered.

    Jumping straight to the end of the page would skip items that are only
    added to the DOM when they enter the viewport.
    """
    offset = 0
    while True:
        drv.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give the page's JavaScript time to append elements
        new_offset = drv.execute_script("return window.pageYOffset;")
        if new_offset <= offset:  # position stopped advancing -> bottom reached
            break
        offset = new_offset


# BUG FIX: open with newline='' as the csv docs require, otherwise every
# record is followed by a blank line on Windows.
with open("data.csv", "w", encoding="utf-8", newline="") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders",
                 "Shipping Cost", "Product links", "Country", "Comments"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)
        scroll_to_bottom(driver)
        sleep(3)

        tree = html.fromstring(driver.page_source)
        results = []

        # NOTE(review): the attribute values inside these XPath predicates
        # were stripped somewhere (e.g. '//div[@]//a'); restore the real
        # class names from the live page before running.
        for product in tree.xpath('//div[@]//a'):
            title = product.xpath('.//h1/text()')
            if not title:
                continue
            title = title[0]

            price_parts = [x.text for x in product.cssselect('div.mGXnE._37W_B span')]
            # BUG FIX: guard against an empty span list -- the original
            # price[0] raised IndexError when the selector matched nothing.
            currency = price_parts[0] if price_parts else ''
            price = ''.join(price_parts[1:])

            review = product.xpath('.//span[@]/text()')
            review = review[0] if review else ''

            nb_sold = product.xpath('.//span[@]/text()')
            nb_sold = nb_sold[0] if nb_sold else ''

            ship_cost = product.xpath('.//span[@]/text()')
            ship_cost = ship_cost[0] if ship_cost else ''

            # BUG FIX: the original did links = links[0] (a single URL string)
            # and then iterated over it, which loops over CHARACTERS, not URLs.
            # Each `product` here is itself an <a> element, so its own href is
            # the product link; keep it in a list and iterate real links.
            href = product.get('href')
            links = [href] if href else []

            # BUG FIX: country/comment were unbound (NameError) whenever the
            # inner loops matched nothing; collect them in lists instead so
            # ALL comments are kept, not just the last one.
            countries = []
            comments = []

            # scrape data from each inner (product) page
            for link in links:
                if link.startswith('//'):
                    # protocol-relative hrefs are common on AliExpress;
                    # driver.get() needs a scheme -- TODO confirm on live site
                    link = 'https:' + link
                driver.get(link)
                sleep(2)
                scroll_to_bottom(driver)
                sleep(3)

                # parse AFTER scrolling, and into a separate tree so the
                # listing-page `tree` is not clobbered mid-iteration
                inner = html.fromstring(driver.page_source)
                for cmt in inner.xpath('//*[@id="transction-feedback"]/div[5]/div[1]'):
                    country = cmt.xpath('.//div[@]//b/text()')
                    countries.append(country[0] if country else '')
                    comment = cmt.xpath('.//span[@id="0.0.0.i4.5dc4sSFDsSFD5B"]/text()')
                    comments.append(comment[0] if comment else '')

            row = [title, price, currency, review, nb_sold, ship_cost,
                   links[0] if links else '',
                   ' | '.join(countries), ' | '.join(comments)]
            results.append(row)

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()

CodePudding user response:

There are two problems:

First:

You have to use html.fromstring(driver.page_source) AFTER you scroll down.

Second:

The page adds items only when they are displayed inside the window (in the viewport), so you can't jump directly to the end of the page. You have to scroll partially (in a loop) using, e.g., window.innerHeight.

# Scroll one viewport at a time so the page's JavaScript can lazy-load more
# items; stop once the scroll position no longer advances (bottom reached).
# NOTE(review): `driver` and `sleep` come from the surrounding script
# (selenium webdriver / time.sleep) -- this fragment is not self-contained.
current_offset = 0

while True:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)  # JavaScript has time to add elements

    new_offset = driver.execute_script("return window.pageYOffset;")
    #print(new_offset,current_offset)
    if new_offset <= current_offset:
        break

    current_offset = new_offset

Full working code, with other changes to the XPath expressions.

It gives me 60 items on every page.

from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv

# Edge driver; a Firefox driver (webdriver.Firefox()) works the same way.
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'


with open ("data.csv", "w", encoding="utf-8") as csvfile:

    wr = csv.writer(csvfile)
    wr.writerow(["Title","Price", "Currency", "Reviews", "Number of orders"])

    # walk the first three result pages
    for page in range(1, 4):
        print('---', page, '---')

        driver.get(url.format(page))
        sleep(2)

        # Scroll one viewport at a time: items are only added to the DOM
        # when they enter the viewport, so jumping straight to the bottom
        # (window.scrollTo) would miss most of them.
        prev_y = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # give the page's JavaScript time to append items
            y = driver.execute_script("return window.pageYOffset;")
            print(y, prev_y)
            if y <= prev_y:  # position stopped advancing -> bottom reached
                break
            prev_y = y

        sleep(3)

        # parse AFTER scrolling so the tree holds every loaded item
        tree = html.fromstring(driver.page_source)

        results = []

        # NOTE(review): the attribute values in these XPath/class selectors
        # appear stripped (e.g. '//div[@]'); they are kept verbatim here.
        for item in tree.xpath('//div[@]//a'):
            name = item.xpath('.//h1/text()')
            if not name:  # anchors without an <h1> are not product cards
                continue
            name = name[0]

            pieces = [s.text for s in item.cssselect('div.mGXnE._37W_B span')]

            # layout `$ 35.00`: first span is the currency symbol
            money_unit = pieces[0]
            amount = ''.join(pieces[1:])
            # layout `35.00 zł` would instead be:
            #   money_unit = pieces[-1]; amount = ''.join(pieces[:-1])

            stars = item.xpath('.//span[@]/text()')
            stars = stars[0] if stars else ''

            sold = item.xpath('.//span[@]/text()')
            sold = sold[0] if sold else ''

            results.append([name, amount, money_unit, stars, sold])

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
  • Related