I'm trying to get all the category, sub-category and sub-sub-category author URLs from the dmoz website using BeautifulSoup.
I'm getting the following output:
# every 2nd option/URL is missing in the first step
/Arts/Literature/Authors/A
/Top/Arts/Literature/Authors/C
/Top/Arts/Literature/Authors/E
/Top/Arts/Literature/Authors/G
/Top/Arts/Literature/Authors/Horror
. . .
# every 1st option/URL is missing in the second step
/Arts/Literature/Authors/A/Abbey,_Lynn
/Top/Arts/Literature/Authors/A/Abe,_Kobo
In the above output, every 2nd element is missing in the 1st step and every 1st element in the 2nd step.
Here is my code:
import requests
from bs4 import BeautifulSoup as bs

session = requests.Session()
sub_cat = []
records = []

scrape_url = "http://dmoz.org/Arts/Literature/Authors"
page = session.get(scrape_url, headers={
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs={'class': 'row'})[6:7]

# get all the root category author list
for test in find_row:
    if test.find('div', attrs={'class': 'panel-body'}):
        test_link = test.find_all('a')
        for link in test_link:
            sub_cat.append(link['href'])

# now get the sub or sub-sub category author URL list
for cat in sub_cat:
    scrape_cat_url = "http://dmoz.org%s" % (cat)
    print('scraping...', scrape_cat_url)
    page = session.get(scrape_cat_url, headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    })
    soup = bs(page.text, 'html.parser')
    find_row = soup.find_all('div', attrs={'class': 'row'})[6:7]

    # if sub category go next level or restart
    for row in find_row:
        if row.find('div', attrs={'class': 'panel-body'}):
            test_link = row.find_all('a')
            for link in test_link:
                sub_cat.append(link['href'])
            records.append(scrape_cat_url)
        else:
            records.append(scrape_cat_url)

    # remove the category url from the sub_cat list
    sub_cat.remove(cat)
Can anybody suggest a better way to get all the category, sub-category and sub-sub-category author URLs?
CodePudding user response:
Try this streamlined version of your code:
import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

scrape_url = "http://dmozlive.com/Top/Arts/Literature/Authors"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs={'class': 'row'})[6:7]

cats = []
for row in find_rows:
    links = row.find_all('a')
    for link in links:
        cats.append(link['href'])

cats
Print out:
['/Top/Arts/Literature/Authors/A',
'/Top/Arts/Literature/Authors/B',
'/Top/Arts/Literature/Authors/C',
'/Top/Arts/Literature/Authors/D',
'/Top/Arts/Literature/Authors/E',
'/Top/Arts/Literature/Authors/F',
…
Now get the subcategories:
sub_cats = []
for cat in cats:
    scrape_url = f"http://dmozlive.com{cat}"
    page = requests.get(scrape_url, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    find_rows = soup.find_all('div', attrs={'class': 'row'})[6:7]
    for row in find_rows:
        links = row.find_all('a')
        for link in links:
            sub_cats.append(link['href'])

sub_cats
Print out:
['/Top/Arts/Literature/Authors/A/Abbey,_Edward',
'/Top/Arts/Literature/Authors/A/Abbey,_Lynn',
'/Top/Arts/Literature/Authors/A/Abbott,_Edwin_A.',
'/Top/Arts/Literature/Authors/A/Abe,_Kobo',
'/Top/Arts/Literature/Authors/A/Achebe,_Chinua',
'/Top/Arts/Literature/Authors/A/Ackroyd,_Peter',
'/Top/Arts/Literature/Authors/A/Adams,_Douglas',
…
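A note on why your original loop skipped entries: calling sub_cat.remove(cat) while the for loop is still iterating over sub_cat shifts the remaining elements one position to the left, so the iterator skips the element that moved into the removed slot. A minimal demonstration:

items = ['A', 'B', 'C', 'D']
for item in items:
    items.remove(item)  # shifts the remaining elements left; the iterator skips one
print(items)  # ['B', 'D'] is left over; 'B' and 'D' were never visited

Collecting results into fresh lists (cats, sub_cats) as above, instead of mutating the list you are iterating over, avoids the problem entirely.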
CodePudding user response:
The following code may meet your expectation; it pulls all the category and sub-category URLs.
import requests
from bs4 import BeautifulSoup

url = 'http://dmozlive.com/Top/Arts/Literature/Authors'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

for cat_url in soup.select('.list-group.col-md-6 a'):
    cat_url = 'http://dmozlive.com' + cat_url.get('href')
    #print(cat_url)
    req2 = requests.get(cat_url, headers=headers)
    soup2 = BeautifulSoup(req2.text, 'html.parser')
    for author_url in soup2.select('.list-group-item'):
        author_url = 'http://dmozlive.com' + str(author_url.get('href'))
        print(author_url)
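If you also need the deeper sub-sub-category levels, one option is a small breadth-first crawl that keeps following category links until no new pages turn up. This is only a sketch under the assumptions already made above (the dmozlive.com mirror and links sitting inside .list-group blocks); adjust the selector and depth limit to your needs:

import requests
from bs4 import BeautifulSoup

BASE = 'http://dmozlive.com'  # assumption: same mirror used above
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}

def crawl(start_path, max_depth=3):
    # breadth-first walk: visit each category page once, up to max_depth levels
    seen = set()
    queue = [(start_path, 0)]
    while queue:
        path, depth = queue.pop(0)
        if path in seen or depth > max_depth:
            continue
        seen.add(path)
        soup = BeautifulSoup(requests.get(BASE + path, headers=headers).text, 'html.parser')
        # assumption: category and author links sit inside .list-group blocks
        for link in soup.select('.list-group a'):
            href = link.get('href')
            if href and href.startswith('/Top/'):
                queue.append((href, depth + 1))
    return seen

all_urls = crawl('/Top/Arts/Literature/Authors')
print(len(all_urls))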