I have problems trying to scrape a website with multiple pages using Spyder: the site has pages 1 to 6 plus a "next" button, and each of the six pages has 30 results. I've tried two solutions without success.
This is the first one:
#SOLUTION 1#
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Start a Chrome session managed by webdriver-manager.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Scrape the six listing pages (page_num=1..6), one row per product.
rows = []
for page_num in range(1, 7):  # range(1, 7) yields 1..6 inclusive
    url = ("https://store.unionlosangeles.com/collections/outerwear"
           "?sort_by=creation_date&page_num=" + str(page_num))
    # The page must be re-loaded and re-parsed on every iteration;
    # otherwise the soup from page 1 is reused and all 6 passes
    # yield the same 30 results.
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    postings = soup.find_all('li', class_='isp_grid_product')
    for post in postings:
        link = post.find('a', class_='isp_product_image_href').get('href')
        link_full = 'https://store.unionlosangeles.com' + link
        vendor = post.find('div', class_='isp_product_vendor').text.strip()
        title = post.find('div', class_='isp_product_title').text.strip()
        price = post.find('div', class_='isp_product_price_wrapper').text.strip()
        rows.append({'Link': link_full, 'Vendor': vendor,
                     'Title': title, 'Price': price})

# Build the frame once at the end: DataFrame.append was deprecated and
# removed in pandas 2.0, and seeding with an empty row leaves a junk record.
df = pd.DataFrame(rows, columns=['Link', 'Vendor', 'Title', 'Price'])
The output of this code is a data frame with 180 rows (30 x 6), but it repeats the results of the first page. Thus, my first 30 rows are the first 30 results of the first page, and the rows 31-60 are again the same results of the first page and so on.
Here is the second solution I tried:
### SOLUTION 2 ###
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Load the first listing page with Selenium, then follow the "next"
# pagination link with plain requests for the remaining pages.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://store.unionlosangeles.com/collections/outerwear?sort_by=creation_date&page_num=1')
soup = BeautifulSoup(driver.page_source, 'lxml')

rows = []
for _ in range(6):  # at most six listing pages
    for post in soup.find_all('li', class_='isp_grid_product'):
        link = post.find('a', class_='isp_product_image_href').get('href')
        rows.append({
            'Link': 'https://store.unionlosangeles.com' + link,
            'Vendor': post.find('div', class_='isp_product_vendor').text.strip(),
            'Title': post.find('div', class_='isp_product_title').text.strip(),
            'Price': post.find('div', class_='isp_product_price_wrapper').text.strip(),
        })
    # The paginator element is a <div class="page-item next">; the href is
    # not on the <div> itself but on the <a> nested inside it — calling
    # .get('href') on the <div> is what raised the original error.
    next_div = soup.find('div', class_='page-item next')
    next_anchor = next_div.find('a') if next_div is not None else None
    if next_anchor is None:
        break  # no further pages
    next_page = 'https://store.unionlosangeles.com' + next_anchor.get('href')
    page = requests.get(next_page)
    soup = BeautifulSoup(page.text, 'lxml')
    # NOTE: the original `i = 1` (instead of `i += 1`) left the counter
    # stuck at 1 forever; a bounded for-loop avoids that class of bug.

# Build the frame once; DataFrame.append was removed in pandas 2.0.
df = pd.DataFrame(rows, columns=['Link', 'Vendor', 'Title', 'Price'])
The problem with this second solution is that the program cannot find the attribute "get" on the element assigned to next_page, for reasons I cannot grasp (I haven't had this problem on other sites with pagination). Thus, I get only the first page and not the others.
How can I fix the code to properly scrape all 180 elements?
CodePudding user response:
The data you see is loaded from external URL via javascript. You can simulate these calls with requests
module. For example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

# The product grid is populated by an external search service; query its
# JSON endpoint directly instead of scraping the rendered page.
collection_url = "https://store.unionlosangeles.com/collections/outerwear?sort_by=creation_date&page_num=1"
api_url = "https://cdn-gae-ssl-premium.akamaized.net/categories_navigation"

# Fetch the collection page once, only to extract the store_id and UUID
# that the search widget's <script> tag carries in its src query string.
page_soup = BeautifulSoup(requests.get(collection_url).content, "html.parser")
widget_src = page_soup.select_one("#isp_search_result_page ~ script")["src"]
widget_query = parse_qs(urlparse(widget_src).query)

params = {
    "page_num": 1,
    "store_id": widget_query["store_id"][0],
    "UUID": widget_query["UUID"][0],
    "sort_by": "creation_date",
    "facets_required": "0",
    "callback": "",
    "related_search": "1",
    "category_url": "/collections/outerwear",
}

rows = []
for page in range(1, 7):
    params["page_num"] = page
    payload = requests.get(api_url, params=params).json()
    # Single-letter item keys map to: u -> link, v -> vendor,
    # l -> title, p -> price.
    rows.extend([item["u"], item["v"], item["l"], item["p"]]
                for item in payload["items"])

df = pd.DataFrame(rows, columns=["link", "vendor", "title", "price"])
print(df.head(10).to_markdown(index=False))
print("Total items =", len(df))
Prints:
link | vendor | title | price |
---|---|---|---|
/products/barn-jacket | Essentials | BARN JACKET | 250 |
/products/work-vest-2 | Essentials | WORK VEST | 120 |
/products/tailored-track-jacket | Martine Rose | TAILORED TRACK JACKET | 1206 |
/products/work-vest-1 | Essentials | WORK VEST | 120 |
/products/60-40-cloth-bug-anorak-1tone | Kapital | 60/40 Cloth BUG Anorak (1Tone) | 747 |
/products/smooth-jersey-stand-man-woman-track-jkt | Kapital | Smooth Jersey STAND MAN & WOMAN Track JKT | 423 |
/products/supersized-sports-jacket | Martine Rose | SUPERSIZED SPORTS JACKET | 1695 |
/products/pullover-vest | Nicholas Daley | PULLOVER VEST | 267 |
/products/flannel-polkadot-x-bandana-reversible-1st-jkt-1 | Kapital | FLANNEL POLKADOT X BANDANA REVERSIBLE 1ST JKT | 645 |
/products/60-40-cloth-bug-anorak-1tone-1 | Kapital | 60/40 Cloth BUG Anorak (1Tone) | 747 |
Total items = 175