Home > OS >  scrape images url using beautifulsoup
scrape images url using beautifulsoup

Time:09-22

I am trying to scrape images, but I get an error telling me "you are probably treating a list of elements like a single element". Is there any solution for this? The page link is https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests

# Browser-like User-Agent so Etsy serves the normal HTML to requests.get().
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}

url = 'https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination'

# keeping it simple: download from https://chromedriver.chromium.org/downloads (match version of Chrome installed)
# put file in same folder as the script.  Firefox driver is available if you search for it
# Raw string (r"...") so the backslashes in the Windows path are not treated
# as escape sequences.
driver = webdriver.Chrome(r"C:\Program Files (x86)\chromedriver.exe")
driver.get(url)
sleep(3)  # give the JS-rendered listing grid time to appear before reading page_source

# Collect the product-page links from the rendered search results.
productlinks = []
soup = BeautifulSoup(driver.page_source, "html.parser")
for listing in soup.select('div.js-merch-stash-check-listing'):
    for anchor in listing.find_all('a', href=True):
        productlinks.append(anchor['href'])

# Visit each product page and print its carousel image URLs.
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # BUG FIX: select() returns a ResultSet (a list of Tags). Calling
    # .find_all() on the list raised "you are probably treating a list of
    # elements like a single element". Select the <img> tags directly and
    # iterate the list instead.
    for image in soup.select('div.carousel-pagination-item-v2 div ul img'):
        # Etsy lazy-loads carousel thumbnails: the URL may live in
        # 'data-src-delay' rather than 'src'; .get() avoids a KeyError.
        src = image.get('src') or image.get('data-src-delay')
        if src:
            print(src)

CodePudding user response:

Your code has two kinds of errors.

  1. In `for image in images.find_all('img', src=True)` you treat `images` as a single element, but `select()` returns a list (a ResultSet), which has no `find_all()` method — that's why you are getting this error.

  2. You are reading the image `src` attribute, which does not exist in the selected elements. You have to use `data-src-delay` instead.

Working Code:

from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
from selenium.webdriver.chrome.service import Service
url = 'https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination'


# Selenium 4 style: pass the driver location via a Service object.
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get(url)
driver.maximize_window()
time.sleep(3)  # crude wait for the JS-rendered listing grid to load
productlinks=[]
soup = BeautifulSoup(driver.page_source, "html.parser")
# NOTE(review): 'ul[] li' contains an empty attribute selector -- the class
# text was most likely stripped when this snippet was republished. Confirm
# the intended selector against the live page; as written it is not a valid
# CSS selector.
tra = soup.select('ul[] li')
for link in tra:
    comp=link.a.get('href')  # first <a> inside each listing <li>; .get() returns None if href is absent
    productlinks.append(comp)
        
        
# Fetch each product page with plain requests and print the carousel thumbnails.
for link in productlinks:
    r =requests.get(link)
    soup=BeautifulSoup(r.content, 'html.parser')
    # select() yields the <img> Tags directly, so plain iteration is correct here.
    images=soup.select('div.carousel-pagination-item-v2   div ul li img')
    for image in images:
        fiv=image['data-src-delay']  # Etsy's lazy-load attribute holding the thumbnail URL
        print(fiv)

Output:

https://i.etsystatic.com/16632652/r/il/479aba/3173560516/il_75x75.3173560516_aitm.jpg
https://i.etsystatic.com/16632652/r/il/f2d30c/3221273491/il_75x75.3221273491_nwmj.jpg
https://i.etsystatic.com/16632652/r/il/65484a/2296684633/il_75x75.2296684633_cl2w.jpg
https://i.etsystatic.com/16632652/r/il/151f9a/2249084734/il_75x75.2249084734_p8cz.jpg
https://i.etsystatic.com/16632652/r/il/eb3132/2249085196/il_75x75.2249085196_af65.jpg
https://i.etsystatic.com/16632652/r/il/44ff2d/2296685209/il_75x75.2296685209_rsav.jpg
https://i.etsystatic.com/16632652/r/il/add08a/2249085320/il_75x75.2249085320_lpki.jpg
https://i.etsystatic.com/16632652/r/il/96d498/2249086588/il_75x75.2249086588_t4pg.jpg
https://i.etsystatic.com/16632652/r/il/c23125/2249086106/il_75x75.2249086106_i7z8.jpg
https://i.etsystatic.com/16632652/r/il/65b42c/3221264087/il_75x75.3221264087_rgiv.jpg
https://i.etsystatic.com/27635917/r/il/148ecc/3608230113/il_75x75.3608230113_zxeh.jpg
https://v-cg.etsystatic.com/video/upload/ar_1:1,c_fill,h_105,q_auto,w_105/file_kwjggd.jpg
https://i.etsystatic.com/27635917/r/il/bc6d03/3560602896/il_75x75.3560602896_ojpr.jpg
https://i.etsystatic.com/27635917/r/il/de42d4/3608230241/il_75x75.3608230241_5nap.jpg
https://i.etsystatic.com/27635917/r/il/03f524/3560603336/il_75x75.3560603336_s9k2.jpg
https://i.etsystatic.com/27635917/r/il/8ec3f3/3608229833/il_75x75.3608229833_1ndd.jpg
https://i.etsystatic.com/27635917/r/il/68a897/3560603466/il_75x75.3560603466_98mk.jpg

... so on

CodePudding user response:

The main issue here is that you call `images.find_all('img', src=True)`, which does not work because `images` is a ResultSet. You should select the elements in a more specific way, so you do not have to filter while iterating (recommended):

# More specific selector: only <img> tags that carry the data-src-delay attribute.
images=soup.select('[data-carousel-pagination-list] li img[data-src-delay]')
for image in images:
...

or iterate it to check each element of ResultSet:

# Broader selector; filter in Python because not every <img> has the attribute.
images=soup.select('[data-carousel-pagination-list] li img')
for image in images:
    if image.get('data-src-delay'):  # .get() avoids KeyError on a missing attribute
        fiv=image.get('data-src-delay')
        print(fiv)

How to fix?

...
driver.get(url)
time.sleep(2)  # brief wait so the JS-rendered results are present in page_source

# Product-page URLs: the anchors that wrap each listing thumbnail.
# NOTE(review): no parser argument is passed to BeautifulSoup here, so it
# falls back to the default parser with a warning; the other snippets use
# 'html.parser' explicitly.
urls = [a.get('href') for a in BeautifulSoup(driver.page_source).select('a[data-palette-listing-image]')]

for url in urls:
    # 'headers' is the User-Agent dict defined earlier in the question's script.
    r =requests.get(url,headers=headers)
    soup=BeautifulSoup(r.content, 'html.parser')
    # Only <img> tags that actually carry the lazy-load URL attribute.
    images=soup.select('[data-carousel-pagination-list] li img[data-src-delay]')
    for image in images:
        fiv=image.get('data-src-delay')  # .get() returns None instead of raising
        print(fiv)

CodePudding user response:

If this is still relevant, here is how you can get the links without using bs4:

# Pure-Selenium approach: open each listing in a new tab, page through the
# image carousel by clicking "Next image", and print each image URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from time import sleep


options = webdriver.ChromeOptions()
# Suppress the noisy "DevTools listening" log output on Windows.
options.add_experimental_option("excludeSwitches", ["enable-logging"])
service = Service(executable_path='path_to_your_driver')
driver = webdriver.Chrome(service=service, options=options)


url = 'https://www.etsy.com/search?q=marokaanse azilal'

# open url
driver.get(url)

# accepted cookies
driver.find_element(By.CSS_SELECTOR, '.wt-btn.wt-btn--filled.wt-mb-xs-0').click()

# wait 3 seconds for all elements on the page to load
sleep(3)

# get all items on the page
items = driver.find_elements(By.CSS_SELECTOR, '.height-placeholder')

for item in items:
    # click on item -- Etsy opens the listing in a new browser tab
    item.click()
    # switch to open tab
    driver.switch_to.window(driver.window_handles[-1])
    # get number of images per page
    number_images = len(driver.find_elements(By.CSS_SELECTOR, 'li[data-carousel-pagination-item]'))
    for i in range(number_images):
        # get list of web elements of required images
        # NOTE: deliberately re-queried on every iteration -- the DOM changes
        # after each "Next image" click, and stale references would break.
        images = driver.find_elements(By.CSS_SELECTOR, 'li[data-palette-listing-image] > img')
        # print image link
        # get_dom_attribute reads the attribute as written in the HTML,
        # not the resolved property value.
        print(images[i].get_dom_attribute('src'))
        # click on the next button
        driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Next image"]').click()
    # close tab
    driver.close()
    # switch to main tab
    driver.switch_to.window(driver.window_handles[0])

# close driver
driver.quit()
  • Related