So I have to scrape all the products from this website's shop (https://bewellstore.ro/shop/), but my code stops at the 12th photo. I have made a version for websites with multiple shop pages where I go through them all in a for loop, but since here there is only one page I thought that wasn't necessary.
Any idea why my code stops at the 12th product?
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the products listed at `baseurl`, download each product's images into
# `folder`, and export the collected product data to `beWell.csv`.
folder = 'beWell_images'
os.makedirs(folder, exist_ok=True)
root_folder = os.getcwd()
baseurl = 'https://bewellstore.ro/shop/'

# Collect the link to every product page found in the listing.
# NOTE: only this one listing URL is requested, so only the products present
# in that single HTML response are discovered.
product_links = []
r = requests.get(baseurl)
soup = BeautifulSoup(r.content, 'lxml')
product_list = soup.find_all('div', class_='loop-product-inner')
print(product_list)
for item in product_list:
    for link in item.find_all(
        'a',
        href=True,
        class_='woocommerce-LoopProduct-link woocommerce-loop-product__link',
    ):
        product_links.append(link['href'])
print(product_links)

product_items_list = []
i = 0   # running counter that gives every downloaded image a unique file name
d = {}  # image URL -> local file name, so each image is downloaded only once

# Image files are opened by bare name, so work inside the image folder.
os.chdir(folder)
for link_test in product_links:
    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')
    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_='price').text.strip()
    header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
    sku = soup.find('span', class_='sku').text.strip()
    categories = soup.find('div', class_='posted_in').text.strip()
    description = soup.find('div', class_='cell large-6').text.strip()
    brand = soup.find('div', class_='tabs-panel').text.strip()  # scraped but not exported
    images = soup.select('.wp-post-image')

    # Names of the image files that belong to this product.
    downloaded = []
    for image in images:
        link = image['src']
        if link not in d:
            i += 1
            name = str(i) + 'img.jpg'
            d[link] = name
            print('link:', link)
            print('name:', name)
            print('---')
            # Fetch first, then write, so a failed request leaves no empty file.
            im = requests.get(link)
            with open(name, 'wb') as f:
                f.write(im.content)
        downloaded.append(d[link])

    # 'a/b/c' for several images, 'a' for one, '' when the page has none.
    img_str = '/'.join(downloaded)

    # Store everything known about this product.
    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
        'name': title,
        'description': description,
        'short_description': header,
        'price': price[0:5],
    }
    product_items_list.append(product)

# Return to the starting directory so the CSV is written next to the script's
# working directory rather than inside the image folder.
os.chdir(root_folder)
# os.chdir('output')
df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)
CodePudding user response:
That's because this webpage uses pagination (with 12 products per page), and each page gets loaded only when you scroll. You will have to use selenium to scroll the page.
But if you only want to use beautifulsoup, then there is a workaround.
The URL for each page looks like this:
https://bewellstore.ro/shop/page/<page_no>/
Example:
1st page: https://bewellstore.ro/shop/page/1/
2nd page: https://bewellstore.ro/shop/page/2/
You could make a request to each of the above URLs and scrape your data using beautifulsoup.
CodePudding user response:
You can try this for all the pages:
import csv
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Walk the paginated shop listing (`/shop/page/<n>/`), collect every product
# link, then scrape each product page, download its images into the current
# directory, and export the collected data to `beWell.csv`.
page_number = 1
product_links = []
headers = {
    'authority': 'bewellstore.ro',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'sec-ch-ua-platform': '"Linux"',
    'accept': '*/*',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://bewellstore.ro/shop/',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': 'fp_session=new; mc_landing_site=https://bewellstore.ro/shop/; _omappvp=i5rIyW2xsMFKIu3uhQtmFj1TN9jw7aKjO8dgy3SVvWMhAj30NvKFrBXfJLe3dQK6ZdbB4FezbrwFWPGLdKrsj1A1vqN2PRLI; _omappvs=1634795539874; _clck=1f7zptk|1|evr|0; _ga=GA1.2.2117949575.1634795541; _gid=GA1.2.1155690725.1634795541; _fbp=fb.1.1634795541140.1266696245; PHPSESSID=94b6b1996b0b5e831d898c192b4bca06; _clsk=2489zg|1634795542054|1|1|e.clarity.ms/collect; yith_wcwl_session_d235bd7d63b3a120c05ba3c90256789a={"session_id":"2e40c31b1503902767c5327edd3cf926","session_expiration":1637387542,"session_expiring":1637383942,"cookie_hash":"49a81940bd8d39b2f894021c16333e6f"}; omSeen-dwf9rgtvzzrhqylccaag=1634795583943; om-dwf9rgtvzzrhqylccaag=1634795585931; _omra={"dwf9rgtvzzrhqylccaag":"click"}; cookie_notice_accepted=true; ls_smartpush=fdfbe0ffe7800007',
}

# Step 1: request listing pages one by one until the site stops answering
# with HTTP 200, gathering the product links from each page.
while True:
    response = requests.get(f'https://bewellstore.ro/shop/page/{page_number}/', headers=headers)
    print(response.status_code)
    print(response.url)
    if response.status_code != 200:
        break
    soup = BeautifulSoup(response.content, 'html.parser')
    product_list = soup.find_all('div', class_='loop-product-inner')
    # print (product_list)
    if not product_list:
        # A 200 response without any product means there is nothing left to
        # collect; stop instead of requesting further pages forever.
        break
    for item in product_list:
        for link in item.find_all(
            'a',
            href=True,
            class_='woocommerce-LoopProduct-link woocommerce-loop-product__link',
        ):
            product_links.append(link['href'])
            print('Addedn link in product_links list :', link['href'])
    # Advance to the next listing page; without this the loop would request
    # the same page again and again.
    page_number += 1

# Step 2: visit every collected product page exactly once.
product_items_list = []
i = 0   # running counter that gives every downloaded image a unique file name
d = {}  # image URL -> local file name, so each image is downloaded only once
for link_test in product_links:
    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')
    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_='price').text.strip()
    header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
    sku = soup.find('span', class_='sku').text.strip()
    categories = soup.find('div', class_='posted_in').text.strip()
    description = soup.find('div', class_='cell large-6').text.strip()
    brand = soup.find('div', class_='tabs-panel').text.strip()  # scraped but not exported
    images = soup.select('.wp-post-image')

    # Names of the image files that belong to this product.
    downloaded = []
    for image in images:
        link = image['src']
        if link not in d:
            i += 1
            name = str(i) + 'img.jpg'
            d[link] = name
            print('link:', link)
            print('name:', name)
            print('---')
            # Fetch first, then write, so a failed request leaves no empty file.
            im = requests.get(link)
            with open(name, 'wb') as f:
                f.write(im.content)
        downloaded.append(d[link])

    # 'a/b/c' for several images, 'a' for one, '' when the page has none.
    img_str = '/'.join(downloaded)

    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
        'name': title,
        'description': description,
        'short_description': header,
        'price': price[0:5],
    }
    product_items_list.append(product)

# Step 3: export everything that was scraped.
df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)