scrape images url using beautifulsoup-CodePudding

I am trying to scrape image URLs, but I get an error saying that I am probably treating a list of elements like a single element. Is there any solution for this? The page link is https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests

# Browser-like User-Agent header, sent with the requests.get() calls further down.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}

# Etsy search-results page whose listings are scraped.
url = 'https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination'

# keeping it simple: download from https://chromedriver.chromium.org/downloads (match version of Chrome installed)
# put file in same folder as the script.  Firefox driver is available if you search for it
# NOTE(review): the path is a normal (non-raw) string containing backslashes;
# a raw string or forward slashes would be safer.
driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")
driver.get(url)

# NOTE(review): identical to the headers dict defined above; this reassignment is redundant.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
# Collect the href of every link found inside the selected listing containers.
productlinks=[]
soup = BeautifulSoup(driver.page_source, "html.parser")
tra = soup.select('div.js-merch-stash-check-listing')
for links in tra:
    for link in links.find_all('a',href=True):
        comp=link['href']
        productlinks.append(comp)
        
        
# Fetch each collected link with requests and try to print its image URLs.
for link in productlinks:
    r =requests.get(link,headers=headers)
    soup=BeautifulSoup(r.content, 'html.parser')
    # select() returns a list of matching elements (a ResultSet), not a single element.
    images=soup.select('div.carousel-pagination-item-v2   div ul')
    # This is the line the question is about: find_all() is called on the
    # whole result list instead of on one element of it.
    for image in images.find_all('img',src=True):
        fiv=image['src']
        print(fiv)

CodePudding user response：

Your code has two kinds of errors.

You call images.find_all('img',src=True), but images is the list of elements returned by select(), not a single element. find_all() can only be called on a single element, which is why you are getting this error.
You are reading the image attribute src, which does not exist on the selected elements. You have to use data-src-delay instead.

Working Code:

from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
from selenium.webdriver.chrome.service import Service
# Etsy search-results page whose listings are scraped.
url = 'https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination'


webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
try:
    driver.get(url)
    driver.maximize_window()
    # Give the JavaScript-rendered search results time to appear.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # The browser is only needed for the search page; always release it,
    # even if loading the page fails.
    driver.quit()

# 'ul[] li' is not a valid CSS selector (empty attribute selector), so the
# listing anchors are selected directly by their data attribute instead.
# Requiring [href] guarantees every collected entry is a usable link.
productlinks = [
    anchor['href']
    for anchor in soup.select('a[data-palette-listing-image][href]')
]


for link in productlinks:
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    # The thumbnail URL is read from data-src-delay rather than src; selecting
    # only <img> tags that carry the attribute avoids a KeyError on others.
    images = soup.select('div.carousel-pagination-item-v2   div ul li img[data-src-delay]')
    for image in images:
        fiv = image['data-src-delay']
        print(fiv)

Output:

https://i.etsystatic.com/16632652/r/il/479aba/3173560516/il_75x75.3173560516_aitm.jpg
https://i.etsystatic.com/16632652/r/il/f2d30c/3221273491/il_75x75.3221273491_nwmj.jpg
https://i.etsystatic.com/16632652/r/il/65484a/2296684633/il_75x75.2296684633_cl2w.jpg
https://i.etsystatic.com/16632652/r/il/151f9a/2249084734/il_75x75.2249084734_p8cz.jpg
https://i.etsystatic.com/16632652/r/il/eb3132/2249085196/il_75x75.2249085196_af65.jpg
https://i.etsystatic.com/16632652/r/il/44ff2d/2296685209/il_75x75.2296685209_rsav.jpg
https://i.etsystatic.com/16632652/r/il/add08a/2249085320/il_75x75.2249085320_lpki.jpg
https://i.etsystatic.com/16632652/r/il/96d498/2249086588/il_75x75.2249086588_t4pg.jpg
https://i.etsystatic.com/16632652/r/il/c23125/2249086106/il_75x75.2249086106_i7z8.jpg
https://i.etsystatic.com/16632652/r/il/65b42c/3221264087/il_75x75.3221264087_rgiv.jpg
https://i.etsystatic.com/27635917/r/il/148ecc/3608230113/il_75x75.3608230113_zxeh.jpg
https://v-cg.etsystatic.com/video/upload/ar_1:1,c_fill,h_105,q_auto,w_105/file_kwjggd.jpg
https://i.etsystatic.com/27635917/r/il/bc6d03/3560602896/il_75x75.3560602896_ojpr.jpg
https://i.etsystatic.com/27635917/r/il/de42d4/3608230241/il_75x75.3608230241_5nap.jpg
https://i.etsystatic.com/27635917/r/il/03f524/3560603336/il_75x75.3560603336_s9k2.jpg
https://i.etsystatic.com/27635917/r/il/8ec3f3/3608229833/il_75x75.3608229833_1ndd.jpg
https://i.etsystatic.com/27635917/r/il/68a897/3560603466/il_75x75.3560603466_98mk.jpg

... so on

CodePudding user response：

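The main issue is that you call images.find_all('img',src=True), which does not work because select() returns a ResultSet (a list of elements) rather than a single element. Instead, select the images in a more specific way, so you do not have to check each element while iterating (recommended):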

# Match only <img> tags that carry data-src-delay, so every result can be
# used directly without an extra check inside the loop.
images=soup.select('[data-carousel-pagination-list] li img[data-src-delay]')
for image in images:
    ...  # placeholder for the per-image handling; indented so the snippet is valid Python

or iterate it to check each element of ResultSet:

# Select every thumbnail <img>, then skip the ones that lack the
# data-src-delay attribute before printing its value.
images = soup.select('[data-carousel-pagination-list] li img')
for image in images:
    if not image.get('data-src-delay'):
        continue
    fiv = image.get('data-src-delay')
    print(fiv)

How to fix?

...
driver.get(url)
# Give the JavaScript-rendered search results a moment to load.
time.sleep(2)

# Name the parser explicitly so bs4 does not have to guess one (which emits a
# warning and can differ between machines). Requiring [href] keeps None out of
# the list that is later passed to requests.get().
search_soup = BeautifulSoup(driver.page_source, 'html.parser')
urls = [a['href'] for a in search_soup.select('a[data-palette-listing-image][href]')]

# A separate loop name keeps the search-page `url` from being overwritten.
for product_url in urls:
    r = requests.get(product_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # Only <img> tags that carry data-src-delay are selected, so .get() always
    # returns the thumbnail URL here.
    images = soup.select('[data-carousel-pagination-list] li img[data-src-delay]')
    for image in images:
        fiv = image.get('data-src-delay')
        print(fiv)

CodePudding user response：

If this is still relevant, here is how you can get the image links without using bs4:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from time import sleep


options = webdriver.ChromeOptions()
# Excludes Chrome's "enable-logging" switch (presumably to keep the console
# output quiet — confirm against the ChromeDriver documentation).
options.add_experimental_option("excludeSwitches", ["enable-logging"])
service = Service(executable_path='path_to_your_driver')
driver = webdriver.Chrome(service=service, options=options)


url = 'https://www.etsy.com/search?q=marokaanse azilal'

# Everything that talks to the browser runs inside try/finally so the driver
# is shut down even when a lookup or click raises.
try:
    # open url
    driver.get(url)

    # accept the cookie banner
    driver.find_element(By.CSS_SELECTOR, '.wt-btn.wt-btn--filled.wt-mb-xs-0').click()

    # wait 3 seconds for all elements on the page to load
    sleep(3)

    # get all items on the page
    items = driver.find_elements(By.CSS_SELECTOR, '.height-placeholder')

    for item in items:
        # click on item
        item.click()
        # switch to the most recently opened tab
        driver.switch_to.window(driver.window_handles[-1])
        # number of carousel entries on the listing page
        number_images = len(driver.find_elements(By.CSS_SELECTOR, 'li[data-carousel-pagination-item]'))
        for i in range(number_images):
            # re-read the image elements on every step, because the carousel
            # is advanced with the "Next image" button below
            images = driver.find_elements(By.CSS_SELECTOR, 'li[data-palette-listing-image] > img')
            # print image link
            print(images[i].get_dom_attribute('src'))
            # click on the next button
            driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Next image"]').click()
        # close tab
        driver.close()
        # switch back to the main tab
        driver.switch_to.window(driver.window_handles[0])
finally:
    # close driver
    driver.quit()