I am giving the URL as input: url = "https://www.amazon.in/s?k=headphones&page=1". This works fine but stops at page 19. Instead of breaking at page 19, I want to supply the next inputs as "https://www.amazon.in/s?k=" followed by
- "speakers&page=1"
- "earbuds&page=1" and so on, running in a loop.
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib.request  # fix: getdata() uses urllib.request but it was never imported

# Accumulates one dict per scraped search result across all pages.
data = []
def getdata(url):
    """Fetch one Amazon search-results page and collect product titles.

    Appends ``{'title': ...}`` dicts to the module-level ``data`` list
    (one per result tile) and returns the parsed page so the caller can
    look up the next-page link.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    amazon_html = urllib.request.urlopen(req).read()
    a_soup = soup(amazon_html, 'html.parser')
    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        # Some result tiles have no <h2>; e.find('h2') is then None and
        # .text raises AttributeError — record None instead of crashing.
        # (Narrowed from a bare `except:` which would also hide real bugs.)
        try:
            title = e.find('h2').text
        except AttributeError:
            title = None
        data.append({
            'title': title
        })
    return a_soup
def getnextpage(a_soup):
    """Return the absolute URL of the "Next" results page.

    Returns None when there is no next-pagination link (last page
    reached), so the caller can stop with ``if not url: break``.
    The original subscripted the result of ``find`` unconditionally and
    crashed with TypeError on the final page; the '+' operator was also
    lost when the code was posted.
    """
    page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
    if page is None:
        return None
    return 'http://www.amazon.in' + str(page['href'])
# Follow the "Next" link page after page until getnextpage() reports
# there is none (it returns a falsy value on the last page).
while True:
    geturl = getdata(url)
    url = getnextpage(geturl)
    if not url:
        break
    print(url)
# Collect every scraped record into a single DataFrame for display.
output = pd.DataFrame(data)
output
This code returns the correct results, but instead of supplying a new URL each time, I want to pass a list of search terms that are appended to the end of the base URL one at a time to fetch the results, which should then be added to the DataFrame. Note: the search results stop at the 19th page.
CodePudding user response:
Make a list for your keywords, iterate it and include the while loop into each iteration.
# One search term per iteration; the inner loop pages through all
# results for that term before moving on to the next keyword.
keywords = ['speakers', 'earbuds']
for k in keywords:
    # Fix: the '+' concatenation operator was lost when the code was posted.
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        print(url)
Be aware that Amazon does not like such automated access to its pages and recognises the access patterns quite quickly. To reduce the request frequency a bit, you should at least include some delay with `time.sleep()`. Of course, it would be even better to use an official API.
Example
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib.request  # fix: `import urllib` alone does not load the request submodule

# Accumulates one dict per scraped search result across all keywords/pages.
data = []
def getdata(url):
    """Fetch one Amazon search-results page and collect title + URL.

    Appends ``{'title': ..., 'url': ...}`` dicts to the module-level
    ``data`` list (one per result tile) and returns the parsed page so
    the caller can look up the next-page link.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    amazon_html = urllib.request.urlopen(req).read()
    a_soup = soup(amazon_html, 'html.parser')
    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        # A tile without an <h2> makes .text raise AttributeError; keep None.
        # (Narrowed from a bare `except:` which would also hide real bugs.)
        try:
            title = e.find('h2').text
        except AttributeError:
            title = None
        # NOTE(review): if the tile had no <h2>, e.h2.a below will still
        # raise — assumes every tile has h2>a; confirm against real pages.
        data.append({
            'title': title,
            # Fix: the '+' concatenation operator was lost when posted.
            'url': 'http://www.amazon.in' + e.h2.a['href']
        })
    return a_soup
def getnextpage(a_soup):
    """Return the absolute URL of the "Next" results page, or None once
    the last page is reached (no next-pagination link present)."""
    try:
        page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
        # Fix: the '+' concatenation operator was lost when posted.
        url = 'http://www.amazon.in' + str(page)
    except TypeError:
        # find() returned None, so subscripting ['href'] raised TypeError:
        # there is no next page. (Narrowed from a bare `except:`.)
        url = None
    return url
# One search term per iteration; the inner loop pages through all
# results for that term before moving on to the next keyword.
keywords = ['speakers', 'earbuds']
for k in keywords:
    # Fix: the '+' concatenation operator was lost when the code was posted.
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        print(url)
Output (print)
http://www.amazon.in/s?k=speakers&page=2&qid=1649420352&ref=sr_pg_1
...
http://www.amazon.in/s?k=speakers&page=20&qid=1649420373&ref=sr_pg_19
http://www.amazon.in/s?k=earbuds&page=2&qid=1649420375&ref=sr_pg_1
...
http://www.amazon.in/s?k=earbuds&page=20&qid=1649420394&ref=sr_pg_19