How to scrap each product page (comments and custumer country)-CodePudding

I am trying to scrape each product page from aliexpress website in order to get number of comments, number of photos published by the custumer and also the custumer country and put it to a dataframe.

I have written a code that scrape custumer country but I don't know how to get the number of custumer comments and the number of images. This is my code :

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f={"sku_id":"12000027213624098"}&pdp_pi=-1;40.81;-1;-1@salePrice;MAD;search-mainSearch'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)

wait = WebDriverWait(driver, 10)

driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))

data=[]

while True:

    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):

        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except:
            country = None

       
        data.append({
            'country':country,
       
        })
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
    except:
        break

pd.DataFrame(data).to_csv('filename.csv',index=False)

I would appreciate any help from you! Thank you !

CodePudding user response：

If you want numbers of comments / reviews, you can just check the value in this section :

driver.find_element(By.XPATH, 'XPATH_OF_ELEMENT_TO_SCRAP')

To do so in your exemple lets do this outside your loop :

number_feedbacks = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]/div[1]')
number_images = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]//label[1]/em')

If you dont understand or know this function, please feal free to ask and I will explain where I found theses XPATH.We also can use find by id function.

In your code it would be :

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f={"sku_id":"12000027213624098"}&pdp_pi=-1;40.81;-1;-1@salePrice;MAD;search-mainSearch'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)

wait = WebDriverWait(driver, 10)

driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))

data=[]

number_feedbacks = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]/div[1]')
number_images = driver.find_element(By.XPATH, '//*[@id="transction-feedback"]//label[1]/em')

print(f'number_feedbacks = {number_feedbacks}\nnumber_images = {number_images}')

while True:

    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):

        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except:
            country = None

       
        data.append({
            'country':country,
       
        })
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
    except:
        break

pd.DataFrame(data).to_csv('filename.csv',index=False)