I'm trying to scrape data from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=AS_20220313071939&SearchText=bluetooth earphones
In particular, I want to get all the reviews from each product page. The main issue is that I'm struggling to reach the feedback section at the bottom of each product page so that I can scrape each comment and the customer's country.
This is my code:
from selenium import webdriver
from lxml import html
import cssselect
from time import sleep
from itertools import zip_longest
import csv

driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'

with open("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders", "Shipping Cost", "Product links", "Country", "Comments"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')

        driver.get(url.format(page_nb))
        sleep(2)

        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        tree = html.fromstring(driver.page_source)

        results = []

        for product in tree.xpath('//div[@]//a'):
            title = product.xpath('.//h1/text()')
            if title:
                title = title[0]

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]
            currency = price[0]
            price = ''.join(price[1:])

            review = product.xpath('.//span[@]/text()')
            if review:
                review = review[0]
            else:
                review = ''

            nb_sold = product.xpath('.//span[@]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = ''

            ship_cost = product.xpath('.//span[@]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = ''

            ###########################################
            links = product.xpath('//div[@]//a/@href')
            if links:
                links = links[0]
            else:
                links = ''

            # scraping data from each inner page
            for link in links:
                driver.get(link)
                sleep(2)

                current_offset = 0
                while True:
                    driver.execute_script("window.scrollBy(0, window.innerHeight);")
                    sleep(.5)
                    new_offset = driver.execute_script("return window.pageYOffset;")
                    print(new_offset, current_offset)
                    if new_offset <= current_offset:
                        break
                    current_offset = new_offset

                sleep(3)

                tree = html.fromstring(driver.page_source)

                for cmt in tree.xpath('//*[@id="transction-feedback"]/div[5]/div[1]'):
                    country = cmt.xpath('.//div[@]//b/text()')
                    if country:
                        country = country[0]
                    else:
                        country = ''

                    comment = cmt.xpath('.//span[@id="0.0.0.i4.5dc4sSFDsSFD5B"]/text()')
                    if comment:
                        comment = comment[0]
                    else:
                        comment = ''

                    row = [title, price, currency, review, nb_sold, ship_cost, links, country, comment]
                    results.append(row)

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
CodePudding user response:
There are two problems.

First: you have to run html.fromstring(driver.page_source) AFTER you scroll down, so that the parsed HTML already contains the lazily loaded products.
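A minimal sketch of that order, using a hypothetical scroll_page_down() helper that wraps the loop shown below:

driver.get(url.format(page_nb))
sleep(2)
scroll_page_down(driver)                    # 1. let lazy loading add all products
tree = html.fromstring(driver.page_source)  # 2. only now snapshot the final HTML

If you parse before scrolling, the tree contains only the handful of products present in the initial HTML.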
Second: the page adds items only when they scroll into the window (into the viewport), so you can't jump straight to the end of the page. You have to scroll step by step, in a loop, by e.g. window.innerHeight:
current_offset = 0
while True:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)  # give JavaScript time to add elements
    new_offset = driver.execute_script("return window.pageYOffset;")
    #print(new_offset, current_offset)
    if new_offset <= current_offset:
        break
    current_offset = new_offset
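The loop stops when window.pageYOffset stops growing, i.e. when the browser can't scroll any further, which on this page only happens after all lazy-loaded items have been appended.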
Full working code, with some other changes in the XPath expressions, is below. It gives me 60 items on every page.
from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv

driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
#driver = webdriver.Firefox()

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'

# newline="" keeps csv.writer from inserting blank rows on Windows
with open("data.csv", "w", encoding="utf-8", newline="") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')

        driver.get(url.format(page_nb))
        sleep(2)

        # jump to the end of page
        #driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

        # scroll partially
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        tree = html.fromstring(driver.page_source)

        results = []

        for product in tree.xpath('//div[@]//a'):
            title = product.xpath('.//h1/text()')
            #print('[DEBUG] title:', title)
            if title:
                title = title[0]
                #print('[DEBUG] title:', title)

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]

            # for `$ 35.00`
            currency = price[0]
            price = ''.join(price[1:])

            # for `35.00 zł`
            #currency = price[-1]
            #price = ''.join(price[:-1])

            #print('[DEBUG] price:', price)
            #print('[DEBUG] currency:', currency)

            review = product.xpath('.//span[@]/text()')
            if review:
                review = review[0]
            else:
                review = ''
            #print('[DEBUG] review:', review)

            nb_sold = product.xpath('.//span[@]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = ''
            #print('[DEBUG] nb_sold:', nb_sold)

            row = [title, price, currency, review, nb_sold]
            results.append(row)
            #print('[DEBUG] row:', row)

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
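For the reviews/comments part of the question you need the same partial-scroll trick on every product page. A possible sketch, not a verified implementation: the selectors below (the product-evaluation iframe id and the feedback-item, user-country and buyer-feedback class names) are assumptions based on the classic AliExpress feedback widget and may not match the current page, so treat them as placeholders to check in DevTools first:

from selenium.webdriver.common.by import By

def scrape_reviews(driver, product_url):
    # sketch only - all selectors below are assumptions
    driver.get(product_url)
    sleep(2)

    # same partial-scroll loop so the lazily loaded feedback section gets created
    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)
        new_offset = driver.execute_script("return window.pageYOffset;")
        if new_offset <= current_offset:
            break
        current_offset = new_offset

    # the feedback list is served in its own iframe, so switch into it first
    iframe = driver.find_element(By.CSS_SELECTOR, '#product-evaluation')  # id assumed
    driver.switch_to.frame(iframe)

    # page_source now returns the iframe's document
    tree = html.fromstring(driver.page_source)
    rows = []
    for fb in tree.cssselect('div.feedback-item'):        # class assumed
        country = fb.cssselect('.user-country b')         # class assumed
        comment = fb.cssselect('.buyer-feedback span')    # class assumed
        rows.append([country[0].text if country else '',
                     comment[0].text if comment else ''])

    driver.switch_to.default_content()  # leave the iframe before the next driver.get()
    return rows

Each [country, comment] pair can then be appended to the CSV row next to the product data collected on the search page.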