I am trying to scrape images, but I get the error message "you are probably treating a list of elements like a single element".
Is there any solution for this? The page link is https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests

# Browser-like User-Agent so Etsy does not reject the follow-up requests.get() calls.
# (Defined once; the original defined the identical dict twice.)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
url = 'https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination'

# keeping it simple: download from https://chromedriver.chromium.org/downloads (match version of Chrome installed)
# put file in same folder as the script. Firefox driver is available if you search for it
# Raw string: in a normal string "\P" is an invalid escape sequence.
driver = webdriver.Chrome(r"C:\Program Files (x86)\chromedriver.exe")
driver.get(url)
sleep(3)  # give the JS-rendered listing grid time to load before reading page_source

productlinks = []
soup = BeautifulSoup(driver.page_source, "html.parser")
tra = soup.select('div.js-merch-stash-check-listing')
for listing in tra:
    for link in listing.find_all('a', href=True):
        productlinks.append(link['href'])

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # BUG FIX: soup.select() returns a ResultSet (a list of Tags); calling
    # .find_all() on the ResultSet itself raises the "you are probably
    # treating a list of elements like a single element" error.
    # Iterate the list and search each element individually instead.
    images = soup.select('div.carousel-pagination-item-v2 div ul')
    for block in images:
        for image in block.find_all('img'):
            # Etsy lazy-loads carousel thumbnails: the real URL is usually in
            # 'data-src-delay' while 'src' may be absent — check both.
            fiv = image.get('src') or image.get('data-src-delay')
            if fiv:
                print(fiv)
CodePudding user response:
Your code is facing two kinds of errors.
You are iterating
for image in images.find_all('img',src=True)
but images is a ResultSet (a list of elements), not a single element, so calling find_all() on it raises the error you see. You are also reading the attribute
src
which does not exist in the selected elements. You have to use data-src-delay
instead.
Working Code:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
from selenium.webdriver.chrome.service import Service

url = 'https://www.etsy.com/search/handmade?q=marokaanse azilal vloerkleden&explicit=1&item_type=handmade&ship_to=NL&page=1&ref=pagination'

webdriver_service = Service("./chromedriver")  # Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get(url)
driver.maximize_window()
time.sleep(3)  # let the JS-rendered results grid finish loading

productlinks = []
soup = BeautifulSoup(driver.page_source, "html.parser")
# BUG FIX: the original selector 'ul[] li' is invalid CSS (the attribute
# inside the [] was lost) and makes soupsieve raise SelectorSyntaxError.
# Select the listing anchors directly via a stable data-* attribute.
for anchor in soup.select('a[data-palette-listing-image]'):
    href = anchor.get('href')
    if href:  # skip anchors without a link instead of appending None
        productlinks.append(href)

for link in productlinks:
    # Send a browser-like User-Agent; Etsy tends to block bare requests.
    r = requests.get(
        link,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                               "AppleWebKit/537.36 (KHTML, like Gecko) "
                               "Chrome/92.0.4515.131 Safari/537.3"},
    )
    soup = BeautifulSoup(r.content, 'html.parser')
    # Thumbnails are lazy-loaded: the URL lives in 'data-src-delay', not 'src'.
    images = soup.select('div.carousel-pagination-item-v2 div ul li img')
    for image in images:
        fiv = image.get('data-src-delay')  # .get() avoids KeyError when missing
        if fiv:
            print(fiv)
Output:
https://i.etsystatic.com/16632652/r/il/479aba/3173560516/il_75x75.3173560516_aitm.jpg
https://i.etsystatic.com/16632652/r/il/f2d30c/3221273491/il_75x75.3221273491_nwmj.jpg
https://i.etsystatic.com/16632652/r/il/65484a/2296684633/il_75x75.2296684633_cl2w.jpg
https://i.etsystatic.com/16632652/r/il/151f9a/2249084734/il_75x75.2249084734_p8cz.jpg
https://i.etsystatic.com/16632652/r/il/eb3132/2249085196/il_75x75.2249085196_af65.jpg
https://i.etsystatic.com/16632652/r/il/44ff2d/2296685209/il_75x75.2296685209_rsav.jpg
https://i.etsystatic.com/16632652/r/il/add08a/2249085320/il_75x75.2249085320_lpki.jpg
https://i.etsystatic.com/16632652/r/il/96d498/2249086588/il_75x75.2249086588_t4pg.jpg
https://i.etsystatic.com/16632652/r/il/c23125/2249086106/il_75x75.2249086106_i7z8.jpg
https://i.etsystatic.com/16632652/r/il/65b42c/3221264087/il_75x75.3221264087_rgiv.jpg
https://i.etsystatic.com/27635917/r/il/148ecc/3608230113/il_75x75.3608230113_zxeh.jpg
https://v-cg.etsystatic.com/video/upload/ar_1:1,c_fill,h_105,q_auto,w_105/file_kwjggd.jpg
https://i.etsystatic.com/27635917/r/il/bc6d03/3560602896/il_75x75.3560602896_ojpr.jpg
https://i.etsystatic.com/27635917/r/il/de42d4/3608230241/il_75x75.3608230241_5nap.jpg
https://i.etsystatic.com/27635917/r/il/03f524/3560603336/il_75x75.3560603336_s9k2.jpg
https://i.etsystatic.com/27635917/r/il/8ec3f3/3608229833/il_75x75.3608229833_1ndd.jpg
https://i.etsystatic.com/27635917/r/il/68a897/3560603466/il_75x75.3560603466_98mk.jpg
... so on
CodePudding user response:
The main issue here is that you try to call images.find_all('img',src=True)
which does not work, because images is a ResultSet (a list), not a single element. Instead, select the images in a more specific way, so you do not have to check each one while iterating (recommended):
# Select only <img> tags that already carry the lazy-load attribute,
# so every element in the ResultSet is guaranteed to have 'data-src-delay'.
images=soup.select('[data-carousel-pagination-list] li img[data-src-delay]')
for image in images:
    ...
or iterate over the ResultSet and check each element individually:
# Broad selection: match every carousel <img>, then keep only those
# that actually carry the lazy-load URL attribute.
images = soup.select('[data-carousel-pagination-list] li img')
for image in images:
    fiv = image.get('data-src-delay')
    if fiv:
        print(fiv)
How to fix?
...
driver.get(url)
time.sleep(2)

# Collect every product-page link from the rendered search results.
product_urls = []
for anchor in BeautifulSoup(driver.page_source).select('a[data-palette-listing-image]'):
    product_urls.append(anchor.get('href'))

for product_url in product_urls:
    response = requests.get(product_url, headers=headers)
    detail_soup = BeautifulSoup(response.content, 'html.parser')
    # Only <img> tags that actually carry the lazy-load URL attribute.
    for thumb in detail_soup.select('[data-carousel-pagination-list] li img[data-src-delay]'):
        fiv = thumb.get('data-src-delay')
        print(fiv)
CodePudding user response:
If this is still relevant, here is how you can get the links without using bs4:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from time import sleep

# Configure Chrome; excludeSwitches hides the noisy "DevTools listening" log lines.
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])
service = Service(executable_path='path_to_your_driver')
driver = webdriver.Chrome(service=service, options=options)

url = 'https://www.etsy.com/search?q=marokaanse azilal'
driver.get(url)

# Dismiss the cookie-consent banner so it cannot intercept later clicks.
driver.find_element(By.CSS_SELECTOR, '.wt-btn.wt-btn--filled.wt-mb-xs-0').click()
sleep(3)  # wait for all elements on the page to load

# Every search-result card on the page.
cards = driver.find_elements(By.CSS_SELECTOR, '.height-placeholder')
for card in cards:
    card.click()  # opens the product page in a new tab
    driver.switch_to.window(driver.window_handles[-1])  # jump to that tab

    # Walk the image carousel, one click per thumbnail.
    total = len(driver.find_elements(By.CSS_SELECTOR, 'li[data-carousel-pagination-item]'))
    for idx in range(total):
        # Re-query each pass: element references can change after clicking Next.
        thumbs = driver.find_elements(By.CSS_SELECTOR, 'li[data-palette-listing-image] > img')
        print(thumbs[idx].get_dom_attribute('src'))
        driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Next image"]').click()

    driver.close()  # close the product tab
    driver.switch_to.window(driver.window_handles[0])  # back to the results tab

driver.quit()