I am trying to web scrape AliExpress using Selenium and Python, and everything seems to be okay. The issue I have run into is that my code scrapes only and exactly 10 products from each page instead of all of them, and I still don't know the reason behind it. Also, I want a method to scrape the links of each product in order to extract reviews and comments. Here is my code:
from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
with open("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

    for page_nb in range(1, 4):
        driver.get('https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'.format(page_nb))
        sleep(1)
        tree = html.fromstring(driver.page_source)
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(10)

        for product_tree in tree.xpath('/html/body/div[3]/div/div/div[2]/div[2]/div/div[2]'):
            title = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[1]/h1/text()')
            price = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[2]/span[2]/text()')
            currency = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[2]/span[1]/text()')
            review = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[4]/span[2]/text()')
            nb_sold = product_tree.xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]//div[2]/div[4]/span[1]/text()')
            list = [title, price, currency, review, nb_sold]
            exported = zip_longest(*list)
            wr.writerows(exported)
            print(list)

driver.close()
Thank you!
CodePudding user response:
There are two problems.

First: you have to use html.fromstring(driver.page_source) after you scroll down, so that you parse the HTML that the scrolling has added.
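A minimal sketch of the right order (scroll_down here is a hypothetical helper standing in for the scroll loop from the second point):

scroll_down(driver)                          # hypothetical: run the scroll loop shown below
tree = html.fromstring(driver.page_source)   # parse only after the scrolling has finished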
Second: the page adds items only when they are displayed inside the window (in the viewport), so you can't jump directly to the end of the page. You have to scroll partially (in a loop) using e.g. window.innerHeight.
current_offset = 0
while True:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)  # give JavaScript time to add elements
    new_offset = driver.execute_script("return window.pageYOffset;")
    #print(new_offset, current_offset)
    if new_offset <= current_offset:
        break
    current_offset = new_offset
Full working code with other changes in the xpath selectors. It gives me 60 items on every page.
from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv

driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
#driver = webdriver.Firefox()

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'

with open("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')

        driver.get(url.format(page_nb))
        sleep(2)

        # jump to the end of page
        #driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

        # scroll partially (in loop)
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # give JavaScript time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        tree = html.fromstring(driver.page_source)

        results = []

        # NOTE: the class values in the three xpath selectors below were lost when this
        # answer was archived. AliExpress uses auto-generated class names that change
        # over time, so inspect the current page source and replace "PRODUCT-LIST-CLASS",
        # "REVIEW-CLASS" and "SOLD-CLASS" with the values you actually see there.
        for product in tree.xpath('//div[@class="PRODUCT-LIST-CLASS"]//a'):
            title = product.xpath('.//h1/text()')
            #print('[DEBUG] title:', title)
            if title:
                title = title[0]
            #print('[DEBUG] title:', title)

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]

            # for `$ 35.00`
            currency = price[0]
            price = ''.join(price[1:])

            # for `35.00 zł`
            #currency = price[-1]
            #price = ''.join(price[:-1])

            #print('[DEBUG] price:', price)
            #print('[DEBUG] currency:', currency)

            review = product.xpath('.//span[@class="REVIEW-CLASS"]/text()')
            if review:
                review = review[0]
            else:
                review = ''
            #print('[DEBUG] review:', review)

            nb_sold = product.xpath('.//span[@class="SOLD-CLASS"]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = ''
            #print('[DEBUG] nb_sold:', nb_sold)

            row = [title, price, currency, review, nb_sold]
            results.append(row)
            #print('[DEBUG] row:', row)

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
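The question also asks for the links of the products. The loop above already iterates over the <a> elements, so you can take the link straight from each anchor and later call driver.get(link) on it to scrape the reviews. A minimal sketch (assuming the anchors carry href attributes; AliExpress often serves them protocol-relative):

# inside the `for product in tree.xpath(...)` loop:
link = product.get('href')           # lxml: read the href attribute of the anchor
if link and link.startswith('//'):   # protocol-relative URL
    link = 'https:' + link
row = [title, price, currency, review, nb_sold, link]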
CodePudding user response:
You need to keep the use of Xpath to a minimum.

Try to find elements by CSS syntax instead. For example, this will find a div element that has the word 123 in its class attribute:

"div[class*='123']"

This will find an element via Xpath whose text contains the word Hello:

"//*[contains(text(),'Hello')]"
You should avoid long Xpath expressions rooted at the top of the document because they are not readable and take longer to execute. After you practice some CSS and Xpath selectors it will be easier for you to find elements, click them or read the data inside them.
Also, a sleep of only 1 second may not be enough in the future. The site might respond with some latency and you will only see what you are looking for after 3 seconds or more. Read about WebDriverWait, which knows how to wait up to a specific timeout for a waiting condition to be met.
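A minimal sketch (the selector is only an illustration):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the first product anchor to appear,
# instead of sleeping for a fixed amount of time
wait = WebDriverWait(driver, 10)
first_link = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a")))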