I am trying to scrape the product links in a category of
https://www.acihellas.gr/gaming-pontikia#/
It has 4 pages of products, but for some reason I only get the first page with the following code:
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
filterprods = '/#/pageSize=21&viewMode=grid&orderBy=10&pageNumber='
for itm in range(1,page_number):
print("Page",itm)
urlget = str(url2get filterprods str(itm))
time.sleep(2)
ses=requests.Session()
r = ses.get(urlget, headers=headers)
if r.status_code == 200:
Myhtml = r.text
soup = BeautifulSoup(Myhtml, 'lxml')
productlist = soup.find_all('div',attrs = {'class','item-box'})
for p_item in productlist:
a = p_item.find('a')
if a:
producttitle = a['title']
productlink = a['href']
url_item = 'https://acihellas.gr' productlink
print(url_item)
urllist.append(url_item)
time.sleep(2)
else:
pass
ses.close()
else:
print(r.status_code)
return urllist
The links are joined correctly, but ses.get(urlget) always returns the first page, so I thought closing the session after each request might help; again, nothing changed.
When inspecting the page, there is no link to the next page, so I constructed the URL with the filterprods variable.
How can we fix this?
thank you
CodePudding user response:
You didn't provide the website URL in your code. You can use the website's API to gather the products. Here is some starter code; I leave the parsing to you :)
import requests

# Endpoint the category page calls in the background to load each page of
# products; posting to it directly avoids having to render any JavaScript.
url = 'https://www.acihellas.gr/getFilteredProducts'

# The category has 4 pages of products, numbered from 1.
for pagenum in range(1, 5):
    # JSON body for one page. Only "pageNumber" and "queryString" vary
    # between iterations; everything else is constant.
    payload = {
        "categoryId": "828",
        "manufacturerId": "0",
        "vendorId": "0",
        "priceRangeFilterModel7Spikes": "null",
        "specificationFiltersModel7Spikes": {
            "CategoryId": "828",
            "ManufacturerId": "0",
            "VendorId": "0",
            "SpecificationFilterGroups": [{
                "Id": 998,
                "FilterItems": [
                    {"Id": "25188", "FilterItemState": "Unchecked"},
                    {"Id": "18572", "FilterItemState": "Unchecked"},
                    {"Id": "7361", "FilterItemState": "Unchecked"},
                    {"Id": "7362", "FilterItemState": "Unchecked"},
                    {"Id": "7368", "FilterItemState": "Unchecked"},
                    {"Id": "18060", "FilterItemState": "Unchecked"},
                    {"Id": "19024", "FilterItemState": "Unchecked"},
                    {"Id": "24876", "FilterItemState": "Unchecked"},
                    {"Id": "28037", "FilterItemState": "Unchecked"},
                    {"Id": "23321", "FilterItemState": "Unchecked"},
                ],
            }, {
                "Id": 990,
                "FilterItems": [
                    {"Id": "7336", "FilterItemState": "Unchecked"},
                ],
            }, {
                "Id": 995,
                "FilterItems": [
                    {"Id": "7350", "FilterItemState": "Unchecked"},
                    {"Id": "7348", "FilterItemState": "Unchecked"},
                    {"Id": "7349", "FilterItemState": "Unchecked"},
                ],
            }],
        },
        "pageNumber": str(pagenum),
        "orderby": "10",
        "viewmode": "grid",
        "pagesize": "21",
        "queryString": f"#/pageSize=21&viewMode=grid&orderBy=10&pageNumber={pagenum}",
        "shouldNotStartFromFirstPage": "true",
        "keyword": "",
        "searchCategoryId": "0",
        "searchManufacturerId": "0",
        "searchVendorId": "0",
        "priceFrom": "",
        "priceTo": "",
        "includeSubcategories": "False",
        "searchInProductDescriptions": "False",
        "advancedSearch": "False",
        "isOnSearchPage": "False",
        "inStockFilterModel": {
            "CategoryId": "828",
            "ManufacturerId": "0",
            "VendorId": "0",
            "Id": "1",
            "FilterItemState": "Unchecked",
        },
    }
    # `json=` serialises the dict and sets the Content-Type header; the
    # timeout keeps a stalled connection from hanging the script forever.
    res = requests.post(url, json=payload, timeout=30)
    print(res.text)
Note: Your links are relative, so you need to prepend them with website URL: https://www.acihellas.gr/
EDIT:
Answering the question of whether there is a way to change only the category in the payload: it looks like yes. I removed the entire section of the payload
variable that was basically the product filters, and it still works:
# Reduced request body: the specification-filter section has been dropped.
# `pagenum` is the loop variable of the enclosing `for pagenum in range(1, 5)`
# loop; to scrape another category, change the "categoryId"/"CategoryId" values.
payload = {
    "categoryId": "828",
    "manufacturerId": "0",
    "vendorId": "0",
    "priceRangeFilterModel7Spikes": "null",
    "pageNumber": str(pagenum),
    "orderby": "10",
    "viewmode": "grid",
    "pagesize": "21",
    "queryString": f"#/pageSize=21&viewMode=grid&orderBy=10&pageNumber={pagenum}",
    "shouldNotStartFromFirstPage": "true",
    "keyword": "",
    "searchCategoryId": "0",
    "searchManufacturerId": "0",
    "searchVendorId": "0",
    "priceFrom": "",
    "priceTo": "",
    "includeSubcategories": "False",
    "searchInProductDescriptions": "False",
    "advancedSearch": "False",
    "isOnSearchPage": "False",
    "inStockFilterModel": {
        "CategoryId": "828",
        "ManufacturerId": "0",
        "VendorId": "0",
        "Id": "1",
        "FilterItemState": "Unchecked",
    },
}
CodePudding user response:
I cannot accurately test this because this domain is blocked in my country, but maybe you could try something like this:
import requests
import bs4 as bs
from urllib.parse import urljoin

# NOTE: everything after '#' is a URL fragment. Fragments are resolved by the
# browser and are not part of the HTTP request, so the server may return the
# same first page for every value of `page`. Check that the scraped names
# actually differ between pages before relying on this approach.
url_base = 'http://www.acihellas.gr/gaming-pontikia#/pageSize=21&viewMode=grid&orderBy=10&pageNumber={page}'
total_pages = 4

# Maps each product's absolute URL to {'name': ..., 'url': ...}.
products = {}

for page in range(1, total_pages + 1):
    page_url = url_base.format(page=page)
    print(f'Scraping page {page}...')
    # The timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(page_url, timeout=30)
    soup = bs.BeautifulSoup(res.text, 'lxml')
    items = soup.find_all('div', {'class': 'details'})

    # Add items to dictionary
    for item in items:
        heading = item.find("h2")
        link = item.find("a")
        # Skip boxes that lack a title or a link instead of crashing on them.
        if heading is None or link is None or not link.get('href'):
            continue
        name = heading.text.strip()
        print(name)
        # Product links on the page are relative; make them absolute.
        product_url = urljoin(page_url, link['href'])
        products[product_url] = {'name': name, 'url': product_url}

print(products)
The products dictionary should then contain the name and URL of each item.