I am trying to web scrape AliExpress using Selenium and Python, and everything seems to be okay. The issue I have run into is that my code scrapes only and exactly 10 products from each page instead of all of them, and I still don't know the reason behind it. Also, I want a method to scrape the links of each product in order to extract reviews and comments. Here is my code:
from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
with open("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

    for page_nb in range(1, 4):
        driver.get('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'.format(page_nb))
        sleep(1)
        tree = html.fromstring(driver.page_source)
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(10)

        for product_tree in tree.xpath('/html/body/div[3]/div/div/div[2]/div[2]/div/div[2]'):
            title = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[1]/h1/text()')
            price = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[2]/span[2]/text()')
            currency = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[2]/span[1]/text()')
            review = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[4]/span[2]/text()')
            nb_sold = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[4]/span[1]/text()')
            list = [title, price, currency, review, nb_sold]
            exported = zip_longest(*list)
            wr.writerows(exported)
            print(list)

driver.close()
Thank you!
CodePudding user response:
There are two problems.

First: you have to use html.fromstring(driver.page_source) after you scroll down, so that you parse the HTML that the scrolling has added.
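A minimal sketch of the right order (scroll_down here is a hypothetical helper standing in for the scroll loop from the second point):

scroll_down(driver)                          # hypothetical: run the scroll loop shown below
tree = html.fromstring(driver.page_source)   # parse only after the scrolling has finished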
Second: the page adds items only when they are displayed inside the window (in the viewport), so you can't jump directly to the end of the page. You have to scroll partially (in a loop) using e.g. window.innerHeight.
current_offset = 0
while True:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)  # give JavaScript time to add elements
    new_offset = driver.execute_script("return window.pageYOffset;")
    #print(new_offset, current_offset)
    if new_offset <= current_offset:
        break
    current_offset = new_offset
Full working code with other changes in the xpath selectors. It gives me 60 items on every page.
from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv

driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
#driver = webdriver.Firefox()

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'

with open("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')

        driver.get(url.format(page_nb))
        sleep(2)

        # jump to the end of page
        #driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

        # scroll partially (in loop)
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # give JavaScript time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        tree = html.fromstring(driver.page_source)

        results = []

        # NOTE: the class values in the three xpath selectors below were lost when this
        # answer was archived. AliExpress uses auto-generated class names that change
        # over time, so inspect the current page source and replace "PRODUCT-LIST-CLASS",
        # "REVIEW-CLASS" and "SOLD-CLASS" with the values you actually see there.
        for product in tree.xpath('//div[@class="PRODUCT-LIST-CLASS"]//a'):
            title = product.xpath('.//h1/text()')
            #print('[DEBUG] title:', title)
            if title:
                title = title[0]
            #print('[DEBUG] title:', title)

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]

            # for `$ 35.00`
            currency = price[0]
            price = ''.join(price[1:])

            # for `35.00 zł`
            #currency = price[-1]
            #price = ''.join(price[:-1])

            #print('[DEBUG] price:', price)
            #print('[DEBUG] currency:', currency)

            review = product.xpath('.//span[@class="REVIEW-CLASS"]/text()')
            if review:
                review = review[0]
            else:
                review = ''
            #print('[DEBUG] review:', review)

            nb_sold = product.xpath('.//span[@class="SOLD-CLASS"]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = ''
            #print('[DEBUG] nb_sold:', nb_sold)

            row = [title, price, currency, review, nb_sold]
            results.append(row)
            #print('[DEBUG] row:', row)

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
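The question also asks for the links of the products. The loop above already iterates over the <a> elements, so you can take the link straight from each anchor and later call driver.get(link) on it to scrape the reviews. A minimal sketch (assuming the anchors carry href attributes; AliExpress often serves them protocol-relative):

# inside the `for product in tree.xpath(...)` loop:
link = product.get('href')           # lxml: read the href attribute of the anchor
if link and link.startswith('//'):   # protocol-relative URL
    link = 'https:' + link
row = [title, price, currency, review, nb_sold, link]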
CodePudding user response:
You need to keep the use of Xpath to a minimum.

Try to find elements by CSS syntax instead. For example, this will find a div element that has the word 123 in its class attribute:

"div[class*='123']"

This will find an element via Xpath whose text contains the word Hello:

"//*[contains(text(),'Hello')]"
You should avoid long Xpath expressions rooted at the top of the document because they are not readable and take longer to execute. After you practice some CSS and Xpath selectors it will be easier for you to find elements, click them or read the data inside them.
Also, a sleep of only 1 second may not be enough in the future. The site might respond with some latency and you will only see what you are looking for after 3 seconds or more. Read about WebDriverWait, which knows how to wait up to a specific timeout for a waiting condition to be met.
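A minimal sketch (the selector is only an illustration):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the first product anchor to appear,
# instead of sleeping for a fixed amount of time
wait = WebDriverWait(driver, 10)
first_link = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a")))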