How to scrape data from each product page from Aliexpress using python selenium-CodePudding

I am trying to scrape each product page from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=SB_20220315022920&SearchText=bluetooth earphones

In particular, I want to get the comments and the customer countries, as shown in the photo: enter image description here

The main issue is that my code does not inspect the right elements and this is what I am struggling with . First, I tried my scraping on this product : https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f={"sku_id":"12000027213624098"}&pdp_pi=-1;40.81;-1;-1@salePrice;MAD;search-mainSearch

Here is my code :

from selenium import webdriver
from selenium.webdriver.common.by import By  
from lxml import html 
import cssselect
from time import sleep
from itertools import zip_longest
import csv

# Script purpose: open one AliExpress product page in Edge, click the feedback
# tab, then print the country and the comment found for each review node.
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
# NOTE(review): this literal is delimited by double quotes but also contains
# double quotes inside `{"sku_id":"..."}`, so the string ends early and the
# line does not parse as written; single-quote delimiters would avoid that.
url = "https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f={"sku_id":"12000027213624098"}&pdp_pi=-1;40.81;-1;-1@salePrice;MAD;search-mainSearch"

# Only the header row is written: the `with` block ends right after it, so the
# file is already closed when the reviews are read, and nothing below writes
# to `wr` (the results are only printed).
with open ("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Comment","Custumer country"])

# Load the product page and scroll to the bottom of the document.
driver.get(url)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')


# Locate the feedback tab by its `ae_button_type` attribute and click it.
review_buttom = driver.find_element_by_xpath('//li[@ae_button_type="tab_feedback"]')
review_buttom.click()


# NOTE(review): `find_element_by_xpath` returns a WebElement, whereas
# `html.fromstring` parses markup text -- presumably the element's HTML
# (e.g. its outerHTML attribute) was meant to be passed here; confirm.
html_source = driver.find_element_by_xpath('//div[@id="transction-feedback"]')
tree = html.fromstring(html_source)

#tree = html.fromstring(driver.page_source)
# NOTE(review): `[@]` has no attribute name, so these two XPath expressions are
# not valid; the attribute test appears to be missing and must be restored.
# NOTE(review): the queries run on `rvw` start with `//`, which selects from
# the document root rather than from `rvw`; `.//` would make them relative to
# the current review node.
for rvw in tree.xpath('//div[@]'):
    # Take the first matching text node, or fall back to an empty string.
    country = rvw.xpath('//div[@]//b/text()')
    if country:
        country = country[0]
    else:
        country = ''
    print('country:', country)
    
    # Same first-match-or-empty handling for the comment text.
    comment = rvw.xpath('//dt[@id="buyer-feedback"]//span/text()')
    if comment:
        comment = comment[0]
    else:
        comment = ''
    print('comment:', comment)
    
# Close the current browser window once all reviews have been printed.
driver.close()

Thank you !!

CodePudding user response：

What happens?

There is one main issue: the feedback you are looking for is inside an iframe, so you won't get your information by querying the elements directly from the product page.

How to fix?

Scroll the element that holds the iframe into view, navigate to the iframe's source URL, and then interact with its pagination to collect all the feedback.

Example

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Scrape the country and comment of every review of one AliExpress product,
# following the feedback pager until no further page is available, then save
# the collected rows to ``filename.csv``.
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f={"sku_id":"12000027213624098"}&pdp_pi=-1;40.81;-1;-1@salePrice;MAD;search-mainSearch'


def _text_or_none(parent, selector):
    """Return the text of the first element matching *selector* under *parent*.

    Returns ``None`` when no such element exists or when the node went stale
    before its text could be read, so one incomplete review does not abort
    the whole run.
    """
    try:
        return parent.find_element(By.CSS_SELECTOR, selector).text
    except (NoSuchElementException, StaleElementReferenceException):
        return None


driver = webdriver.Chrome(ChromeDriverManager().install())
data = []

try:
    driver.maximize_window()
    driver.get(url)

    wait = WebDriverWait(driver, 10)

    # Scroll the tab area into view, then load the URL held in the `src`
    # attribute of the `#product-evaluation` element directly, so the feedback
    # markup becomes the top-level document and can be queried normally.
    tab_content = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))
    )
    driver.execute_script("arguments[0].scrollIntoView();", tab_content)
    feedback_frame = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))
    )
    driver.get(feedback_frame.get_attribute('src'))

    while True:
        for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
            data.append({
                'country': _text_or_none(e, '.user-country > b'),
                'comment': _text_or_none(e, '.buyer-feedback span'),
            })

        try:
            wait.until(
                EC.visibility_of_element_located(
                    (By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next')
                )
            ).click()
        except WebDriverException:
            # The wait times out (TimeoutException is a WebDriverException)
            # when no visible "next" link is found; any other WebDriver
            # failure while paging also ends the loop so that the rows
            # collected so far are still written out.
            break
finally:
    # Always shut the browser down, even if scraping fails part-way.
    driver.quit()

pd.DataFrame(data).to_csv('filename.csv', index=False)