How do I scrape nested images from a Featherlight popup on a website? - CodePudding

I am trying to scrape the nested images, and the text beside them, from a Featherlight popup. I tried the following code, but it returns nothing. (Example of the nested images to be scraped.) I am not sure exactly what I am missing or doing wrong. Thank you in advance for your help.

# Open every tuning part on each car page, print the image URLs shown in the
# part's Featherlight popup, then close the popup again.
# Relies on `driver`, `wait`, `EC`, `By`, `WebDriverWait` and `time` being set
# up by the surrounding (elided) code.
trial_list = ["https://www.novitecgroup.com/en/brands/ferrari/roma/","https://www.novitecgroup.com/en/brands/ferrari/f8/f8-spider/"]
for i in trial_list:
    driver.get(i)
#
# codelines
#
    parts_lists = wait.until(EC.visibility_of_all_elements_located(
        (By.XPATH, "//div[@class='tuning-parts-categories__content']/div[1]//li/a[1]")))
    for x in parts_lists:
        # Click through JavaScript so an overlapping element cannot intercept
        # the click; the pause gives the page time to settle after scrolling.
        driver.execute_script("arguments[0].scrollIntoView();", x)
        time.sleep(2)
        driver.execute_script("arguments[0].click();", x)
        try:  # app_carpage > div.featherlight > div > div > div > h2
            # Match on the class alone: the colour list may be a <ul> rather
            # than a <div> -- TODO confirm against the live markup.
            featherlight = WebDriverWait(driver, 5).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, ".tuning-part-popup__colors")))
            product_imgs = featherlight.find_elements(By.XPATH, ".//img")
            if not product_imgs:
                print("image Not Avaliable")
            # Print inside the loop so every image is reported, not just the
            # last one found.
            for product_img in product_imgs:
                print(product_img.get_attribute('src'))
        except Exception:
            # Best effort: a popup without a colour list (the wait times out)
            # is reported and skipped rather than aborting the whole run.
            print("image Not Avaliable")
            # collected_data_product.append(product)
        close = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.featherlight-close-icon.featherlight-close")))
        driver.execute_script("arguments[0].click();", close)

CodePudding user response：

This does not exactly answer your question, but could you not parse this data out of the POST requests to these URLs:

https://www.novitecgroup.com/en/brands/ferrari/roma/loadTuningPartCategory?id=1
https://www.novitecgroup.com/en/brands/ferrari/roma/loadTuningPartCategory?id=2
https://www.novitecgroup.com/en/brands/ferrari/roma/loadTuningPartCategory?id=3
https://www.novitecgroup.com/en/brands/ferrari/roma/loadTuningPartCategory?id=4

The links returned by these URLs can be navigated to and used to get the images you are after, without having to use Selenium. You can then go on to the linked colour data you want.

This script gets the data you are after:

# Scrape Novitec tuning parts (and their colour variants) without a browser.
#
# For each car, the category fragments are fetched by POSTing to
# `loadTuningPartCategory?id=N`; every part listed in a fragment links to a
# popup page whose colour list is parsed in turn. The result is written to
# `car_parts.csv`, one row per (part, colour) combination.
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.novitecgroup.com'

# Placeholder colour record used when a part has no usable colour list.
NO_COLOR = {
    'color_img': 'none',
    'color_code': 'none',
    'color_name': 'none',
    'color_article_no': 'none',
}

s = requests.Session()

headers =   {
    'accept':'*/*',
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }

cars = ['ferrari/roma/','ferrari/f8/f8-spider/']


def parse_colors(popup_soup):
    """Return one dict per colour found in a part popup page.

    Args:
        popup_soup: parsed popup page (a ``BeautifulSoup`` object).

    Returns:
        A non-empty list of dicts with the keys ``color_img``,
        ``color_code``, ``color_name`` and ``color_article_no``. When the
        page has no ``ul.tuning-part-popup__colors`` or none of its items
        can be parsed, the list holds a single all-``'none'`` placeholder.
    """
    container = popup_soup.find('ul', {'class': 'tuning-part-popup__colors'})
    if container is None:
        return [dict(NO_COLOR)]

    colors = []
    for color in container.find_all('li'):
        try:
            img = color.find('img')
            span = color.find('span')
            color_name = span.find('strong').text.strip()
            colors.append({
                'color_img': urljoin(BASE_URL, img['src']),
                'color_code': img['alt'],
                'color_name': color_name,
                # The span holds "<strong>name</strong> article-no"; drop the
                # name and the layout whitespace to leave the article number.
                'color_article_no': span.text.strip().replace(color_name, '').replace('\t', '').replace('\n', ''),
            })
        except (AttributeError, KeyError, TypeError):
            # Missing <img>/<span>/<strong> or attribute: skip this item only,
            # instead of discarding every colour of the part.
            continue

    return colors or [dict(NO_COLOR)]


final = []
for car in cars:

    # Probe category ids 1-10; an id the car does not have yields an empty body.
    for i in range(1, 11):

        url = f'{BASE_URL}/en/brands/{car}loadTuningPartCategory?id={i}'

        resp = s.post(url, headers=headers)

        if not resp.text:
            continue

        soup = BeautifulSoup(resp.text, 'html.parser')

        part = soup.find('h3').text.strip().replace('&amp;', '&')

        for li in soup.find_all('li'):
            anchors = li.find_all('a')

            # First anchor: thumbnail plus the link to the popup page.
            a1 = anchors[0]
            thumb = a1.find('img')
            link = urljoin(BASE_URL, a1['data-featherlight'])
            data_id = a1['data-id']
            img_src = urljoin(BASE_URL, thumb['data-src'])
            # `data-srcset` is "url descriptor, ..."; keep the first URL.
            big_img_src = urljoin(BASE_URL, thumb['data-srcset'].split(' ')[0])

            # Second anchor: the part's title text.
            a2 = anchors[1]
            text = a2.text.strip()
            subtext = li.find('span').text.strip()

            deeper = s.get(link)
            new_soup = BeautifulSoup(deeper.text, 'html.parser')

            # One output row per colour, so no colour variant is lost.
            for color in parse_colors(new_soup):
                print(part, text, subtext, link, img_src, big_img_src,
                      color['color_code'], color['color_img'])

                item = {
                    'car_link': url,
                    'part_link': link,
                    'part': part,
                    'text': text,
                    'subtext': subtext,
                    'img_src': img_src,
                    'big_img_src': big_img_src,
                    'color_img': color['color_img'],
                    'color_code': color['color_code'],
                    'color_name': color['color_name'],
                    'color_article_no': color['color_article_no'],
                }

                final.append(item)

df = pd.DataFrame(final)
df.to_csv('car_parts.csv', index=False)