I'm trying to scrape product data from the AliExpress website using Selenium. I've already scraped the title, price and rating for a whole category ("smart lock"), but I don't know how to scrape the number of comments and the pictures of each product in this category.
This is the code I used to scrape the title, price and rating from the smart-lock search URL:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect  # lxml's .cssselect() needs this package installed
import time
def scrap(subject):
    start_time = time.time()
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors-spki-list')
    options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome('C:/Users/ADMIN/Desktop/chromedriver.exe', options=options)
    # the {} placeholder is filled in by url.format(page_nb) below
    url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=smart+lock&ltype=wholesale&SortType=total_tranpro_desc&page={}'
    client = MongoClient("mongodb://localhost:27017/")
    collection_name = subject
    collection = client["db2"][collection_name]
    collection.delete_many({})  # start with an empty collection
    for page_nb in range(1, 6):
        print('---', page_nb, '---')
        wait = WebDriverWait(driver, 10)
        actions = ActionChains(driver)
        driver.get(url.format(page_nb))
        sleep(2)
        # scroll to the bottom so the lazy-loaded products get rendered
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # give JavaScript time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            if new_offset <= current_offset:
                break
            current_offset = new_offset
        sleep(3)
        tree = html.fromstring(driver.page_source)
        results = []
        # the class names in the @class predicates were lost when the question
        # was posted; fill in the current ones from the live page
        for product in tree.xpath('//div[@class="..."]//a'):
            title = product.xpath('.//h1/text()')
            if title:
                title = title[0]
                price = product.cssselect('div.mGXnE._37W_B span')
                leng = len(price)
                price = [x.text for x in price]
                currency = price[leng - 1]          # the last span holds the currency
                price = ''.join(price[0:leng - 1])  # the others hold the digits
                stars = product.xpath('.//span[@class="..."]/text()')
                stars = stars[0] if stars else 'None'
                results.append([title, price, stars])
        df = pd.DataFrame(results, columns=("Title", "Price", "Stars"))
        data = df.to_dict(orient='records')
        collection.insert_many(data)
For example, I want to scrape these two elements:
CodePudding user response:
As mentioned in my comment above, first try to scrape all the product links on this page:
urlList = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR,'a[href*="/item/"]')]
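One caveat: the search page lazy-loads products as you scroll, so this one-liner may only see the first screenful of links. A minimal sketch, reusing the scroll loop from your own code before collecting the hrefs (an assumption about the page's lazy-loading behaviour, not something verified for every listing):
# scroll to the bottom first so lazy-loaded products are in the DOM
current_offset = 0
while True:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)
    new_offset = driver.execute_script("return window.pageYOffset;")
    if new_offset <= current_offset:
        break
    current_offset = new_offset
urlList = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'a[href*="/item/"]')]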
Then iterate over this list and open each detail page:
for url in urlList[:2]:
    driver.get(url)
The tricky part: you have to scroll down until you detect the iframe that holds the feedback, extract its src, and scrape that resource:
...
    try:
        iframe = driver.find_element(By.CSS_SELECTOR, '#feedback iframe').get_attribute('src')
    except:
        iframe = None
    if new_offset <= current_offset or iframe:
        break
    current_offset = new_offset
driver.get(iframe)
print(int(''.join(i for i in driver.find_element(By.CSS_SELECTOR, '#transction-feedback > div').text if i.isdigit())))
print(driver.find_element(By.CSS_SELECTOR, '#cb-withPictures-filter em').text)
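Alternatively, instead of navigating to the iframe's src as a separate page, you can switch into the frame and read the same elements in place. This is plain Selenium frame handling, sketched here under the assumption that the selectors above are still valid:
# switch into the feedback iframe rather than loading its src as a page
frame = driver.find_element(By.CSS_SELECTOR, '#feedback iframe')
driver.switch_to.frame(frame)
total = driver.find_element(By.CSS_SELECTOR, '#transction-feedback > div').text
print(int(''.join(i for i in total if i.isdigit())))
driver.switch_to.default_content()  # return to the product page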
Example
Note: For the demo I sliced urlList[:2] to get only two iterations, but you can simply delete [:2] to process all the URLs in the list.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=smart+lock&ltype=wholesale&SortType=total_tranpro_desc'
wait = WebDriverWait(driver, 10)
actions = ActionChains(driver)
driver.get(url)
urlList = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'a[href*="/item/"]')]
for url in urlList[:2]:
    driver.get(url)
    # scroll until the feedback iframe is attached or we reach the bottom
    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give JavaScript time to add elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        try:
            iframe = driver.find_element(By.CSS_SELECTOR, '#feedback iframe').get_attribute('src')
        except:
            iframe = None
        if new_offset <= current_offset or iframe:
            break
        current_offset = new_offset
    # load the feedback resource and read the two numbers
    driver.get(iframe)
    # 'transction-feedback' is the id as it appears in AliExpress's own markup
    print(int(''.join(i for i in driver.find_element(By.CSS_SELECTOR, '#transction-feedback > div').text if i.isdigit())))
    print(driver.find_element(By.CSS_SELECTOR, '#cb-withPictures-filter em').text)
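Since your question code already writes to MongoDB, you could store the two numbers per product the same way. A minimal sketch, assuming the db2 database from your code and a hypothetical collection name:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
feedback_col = client["db2"]["smart_lock_feedback"]  # hypothetical collection name

rows = []
for url in urlList[:2]:
    driver.get(url)
    # ... scroll loop from above until `iframe` is found ...
    driver.get(iframe)
    comments = int(''.join(i for i in driver.find_element(By.CSS_SELECTOR, '#transction-feedback > div').text if i.isdigit()))
    with_pictures = driver.find_element(By.CSS_SELECTOR, '#cb-withPictures-filter em').text
    rows.append({'Url': url, 'Comments': comments, 'WithPictures': with_pictures})
feedback_col.insert_many(rows)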