Home > OS >  Why does scraper stop at the 12th image?
Why does scraper stop at the 12th image?

Time:10-21

So I have to scrape all the products from this website's shop ( https://bewellstore.ro/shop/), but my code stops at the 12th photo. I have made a version for websites with multiple shop pages where I take them all in a for loop, but since here it's only one page I thought that isn't necessary. Any idea why my code stops at the 12th product?



import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape all products listed on the beWell shop page: download each product's
# images into `folder` and write one CSV row per product.
folder = 'beWell_images'
os.makedirs(folder, exist_ok=True)

root_folder = os.getcwd()

baseurl = 'https://bewellstore.ro/shop/'

# all product-page URLs collected from the shop listing
product_links = []

# NOTE(review): only the first listing page is fetched here, but the shop
# paginates at 12 products per page (https://bewellstore.ro/shop/page/<n>/).
# That is why the scrape stops at the 12th product.
url = 'https://bewellstore.ro/shop/'

r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

product_list = soup.find_all('div', class_='loop-product-inner')
print(product_list)

# collect the link to each product's detail page
for item in product_list:
    for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
        product_links.append(link['href'])
print(product_links)

product_items_list = []

i = 0   # running counter used to build unique image file names
d = {}  # image URL -> saved file name; acts as a "seen" set so shared images are fetched once

os.chdir(folder)

for link_test in product_links:

    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')

    # product attributes scraped from the detail page
    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_='price').text.strip()
    header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
    sku = soup.find('span', class_='sku').text.strip()
    categories = soup.find('div', class_='posted_in').text.strip()
    description = soup.find('div', class_='cell large-6').text.strip()
    brand = soup.find('div', class_='tabs-panel').text.strip()

    images = soup.select('.wp-post-image')

    # file names of this product's images (downloaded now or previously)
    downloaded = []

    for image in images:
        link = image['src']
        if link in d:
            # image already saved for an earlier product — reuse its file name
            name = d[link]
            downloaded.append(name)
        else:
            i += 1
            name = str(i) + 'img.jpg'  # e.g. "3img.jpg"
            d[link] = name
            print('link:', link)
            print('name:', name)
            print('---')
            # download and save the image under the generated name
            with open(name, 'wb') as f:
                im = requests.get(link)
                #print("URMEAZA DEBUG: {}".format(im))
                f.write(im.content)
            downloaded.append(name)

    # join the image file names with '/' for the CSV image columns;
    # handles 0, 1, or many images without special-casing (the original
    # indexed downloaded[0] and crashed on products with no image)
    img_str = '/'.join(downloaded)

    # one CSV row describing this product
    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
        'name': title,
        'description': description,
        'short_description': header,
        'price': price[0:5]
    }
    product_items_list.append(product)

os.chdir(root_folder)
# os.chdir('output')
df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)



CodePudding user response:

That's because this webpage uses pagination (with 12 products per page) and each page gets loaded only when you scroll. You will have to use Selenium to scroll the page.

But if you only want to use BeautifulSoup, then there is a workaround.

  • The URL for each page looks like this

    https://bewellstore.ro/shop/page/<page_no>/
    
  • Example:

    1st page: https://bewellstore.ro/shop/page/1/
    2nd page: https://bewellstore.ro/shop/page/2/
    
You could make a request to each of the above URLs and scrape your data using BeautifulSoup.

CodePudding user response:

You can try this to cover all the pages:

import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import pandas as pd  # required: pd.DataFrame is used below

results = []

page_number = 1
product_links = []

# Browser-like headers (incl. session cookie) copied from a real request,
# so the site serves the normal HTML listing.
headers = {
    'authority': 'bewellstore.ro',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'sec-ch-ua-platform': '"Linux"',
    'accept': '*/*',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://bewellstore.ro/shop/',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': 'fp_session=new; mc_landing_site=https://bewellstore.ro/shop/; _omappvp=i5rIyW2xsMFKIu3uhQtmFj1TN9jw7aKjO8dgy3SVvWMhAj30NvKFrBXfJLe3dQK6ZdbB4FezbrwFWPGLdKrsj1A1vqN2PRLI; _omappvs=1634795539874; _clck=1f7zptk|1|evr|0; _ga=GA1.2.2117949575.1634795541; _gid=GA1.2.1155690725.1634795541; _fbp=fb.1.1634795541140.1266696245; PHPSESSID=94b6b1996b0b5e831d898c192b4bca06; _clsk=2489zg|1634795542054|1|1|e.clarity.ms/collect; yith_wcwl_session_d235bd7d63b3a120c05ba3c90256789a={"session_id":"2e40c31b1503902767c5327edd3cf926","session_expiration":1637387542,"session_expiring":1637383942,"cookie_hash":"49a81940bd8d39b2f894021c16333e6f"}; omSeen-dwf9rgtvzzrhqylccaag=1634795583943; om-dwf9rgtvzzrhqylccaag=1634795585931; _omra={"dwf9rgtvzzrhqylccaag":"click"}; cookie_notice_accepted=true; ls_smartpush=fdfbe0ffe7800007',
}

# Phase 1: walk the paginated shop (12 products per page) until a page
# returns a non-200 status, collecting every product-page link.
while True:
    response = requests.get(f'https://bewellstore.ro/shop/page/{page_number}/', headers=headers)
    print(response.status_code)
    print(response.url)
    if response.status_code != 200:
        break
    soup = BeautifulSoup(response.content, 'html.parser')
    product_list = soup.find_all('div', class_='loop-product-inner')
    # print (product_list)

    for item in product_list:
        for link in item.find_all('a', href=True, class_='woocommerce-LoopProduct-link woocommerce-loop-product__link'):
            product_links.append(link['href'])
            print('Addedn link in product_links list :', link['href'])

    # advance to the next page (this was `page_number  = 1` with the `+`
    # stripped, which re-assigned 1 forever and looped on the first page)
    page_number += 1

# Phase 2: scrape each product page exactly once. This loop was originally
# nested inside the `while` above, which re-scraped the whole growing link
# list on every page and reset product_items_list each iteration.
product_items_list = []
i = 0   # running counter used to build unique image file names
d = {}  # image URL -> saved file name; avoids downloading shared images twice

for link_test in product_links:

    r = requests.get(link_test)
    soup = BeautifulSoup(r.content, 'lxml')

    # product attributes scraped from the detail page
    title = soup.find('h1', class_='product_title').text.strip()
    price = soup.find('p', class_='price').text.strip()
    header = soup.find('div', class_='woocommerce-product-details__short-description').text.strip()
    sku = soup.find('span', class_='sku').text.strip()
    categories = soup.find('div', class_='posted_in').text.strip()
    description = soup.find('div', class_='cell large-6').text.strip()
    brand = soup.find('div', class_='tabs-panel').text.strip()

    images = soup.select('.wp-post-image')

    downloaded = []

    for image in images:
        link = image['src']
        if link in d:
            # already saved — reuse its file name, do not re-download
            name = d[link]
            downloaded.append(name)
        else:
            i += 1
            name = str(i) + 'img.jpg'
            d[link] = name
            print('link:', link)
            print('name:', name)
            print('---')
            # download belongs inside the else-branch: the posted version had
            # it de-dented, so cached images were re-fetched and re-appended
            with open(name, 'wb') as f:
                im = requests.get(link)
                #print("URMEAZA DEBUG: {}".format(im))
                f.write(im.content)
            downloaded.append(name)

    # join image file names with '/'; safe for 0, 1, or many images
    img_str = '/'.join(downloaded)

    # one CSV row describing this product
    product = {
        'sku': sku,
        'base_image': img_str,
        'small_image': img_str,
        'thumbnail_image': img_str,
        'additional_images': img_str,
        'product_type': 'simple',
        'attribute_set_code': 'Default',
        'categories': categories.replace('Categorii: ', '').replace(', ', '/'),
        'name': title,
        'description': description,
        'short_description': header,
        'price': price[0:5]
    }
    product_items_list.append(product)


df = pd.DataFrame(product_items_list)
print(df)
df.to_csv('beWell.csv', index=False)
  • Related