I'm trying to get all the category, sub-category and sub-sub-category author URLs from the dmoz website using BeautifulSoup.
I'm getting the following output:
# every 2nd option/URL is missing in the first step
/Arts/Literature/Authors/A
/Top/Arts/Literature/Authors/C
/Top/Arts/Literature/Authors/E
/Top/Arts/Literature/Authors/G
/Top/Arts/Literature/Authors/Horror
. . .
# every 1st option/URL is missing in the second step
/Arts/Literature/Authors/A/Abbey,_Lynn
/Top/Arts/Literature/Authors/A/Abe,_Kobo
In the above output, every 2nd element is missing in the 1st step and every 1st element in the 2nd step.
Here is my code:
import requests
from bs4 import BeautifulSoup as bs

session = requests.Session()
sub_cat = []
records = []

scrape_url = "http://dmoz.org/Arts/Literature/Authors"
page = session.get(scrape_url, headers={
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs={'class': 'row'})[6:7]

# get all the root category author list
for test in find_row:
    if test.find('div', attrs={'class': 'panel-body'}):
        test_link = test.find_all('a')
        for link in test_link:
            sub_cat.append(link['href'])

# now get the sub or sub-sub category author URL list
for cat in sub_cat:
    scrape_cat_url = "http://dmoz.org%s" % (cat)
    print('scraping...', scrape_cat_url)
    page = session.get(scrape_cat_url, headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    })
    soup = bs(page.text, 'html.parser')
    find_row = soup.find_all('div', attrs={'class': 'row'})[6:7]

    # if sub category go next level or restart
    for row in find_row:
        if row.find('div', attrs={'class': 'panel-body'}):
            test_link = row.find_all('a')
            for link in test_link:
                sub_cat.append(link['href'])
            records.append(scrape_cat_url)
        else:
            records.append(scrape_cat_url)

    # remove the category url from the sub_cat list
    sub_cat.remove(cat)
Can anybody suggest a better way to get all the category, sub-category and sub-sub-category author URLs?
CodePudding user response:
Try this streamlined version of your code:
import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

scrape_url = "http://dmozlive.com/Top/Arts/Literature/Authors"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs={'class': 'row'})[6:7]

cats = []
for row in find_rows:
    links = row.find_all('a')
    for link in links:
        cats.append(link['href'])

cats
Print out:
['/Top/Arts/Literature/Authors/A',
'/Top/Arts/Literature/Authors/B',
'/Top/Arts/Literature/Authors/C',
'/Top/Arts/Literature/Authors/D',
'/Top/Arts/Literature/Authors/E',
'/Top/Arts/Literature/Authors/F',
…
Now get the subcategories:
sub_cats = []
for cat in cats:
    scrape_url = f"http://dmozlive.com{cat}"
    page = requests.get(scrape_url, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    find_rows = soup.find_all('div', attrs={'class': 'row'})[6:7]
    for row in find_rows:
        links = row.find_all('a')
        for link in links:
            sub_cats.append(link['href'])

sub_cats
Print out:
['/Top/Arts/Literature/Authors/A/Abbey,_Edward',
'/Top/Arts/Literature/Authors/A/Abbey,_Lynn',
'/Top/Arts/Literature/Authors/A/Abbott,_Edwin_A.',
'/Top/Arts/Literature/Authors/A/Abe,_Kobo',
'/Top/Arts/Literature/Authors/A/Achebe,_Chinua',
'/Top/Arts/Literature/Authors/A/Ackroyd,_Peter',
'/Top/Arts/Literature/Authors/A/Adams,_Douglas',
…
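A note on why your original loop skipped entries: calling sub_cat.remove(cat) while the for loop is still iterating over sub_cat shifts the remaining elements one position to the left, so the iterator skips the element that moved into the removed slot. A minimal demonstration:

items = ['A', 'B', 'C', 'D']
for item in items:
    items.remove(item)  # shifts the remaining elements left; the iterator skips one
print(items)  # ['B', 'D'] is left over; 'B' and 'D' were never visited

Collecting results into fresh lists (cats, sub_cats) as above, instead of mutating the list you are iterating over, avoids the problem entirely.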
CodePudding user response:
The following code may meet your expectation; it pulls all the category and sub-category URLs.
import requests
from bs4 import BeautifulSoup

url = 'http://dmozlive.com/Top/Arts/Literature/Authors'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

for cat_url in soup.select('.list-group.col-md-6 a'):
    cat_url = 'http://dmozlive.com' + cat_url.get('href')
    #print(cat_url)
    req2 = requests.get(cat_url, headers=headers)
    soup2 = BeautifulSoup(req2.text, 'html.parser')
    for author_url in soup2.select('.list-group-item'):
        author_url = 'http://dmozlive.com' + str(author_url.get('href'))
        print(author_url)
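If you also need the deeper sub-sub-category levels, one option is a small breadth-first crawl that keeps following category links until no new pages turn up. This is only a sketch under the assumptions already made above (the dmozlive.com mirror and links sitting inside .list-group blocks); adjust the selector and depth limit to your needs:

import requests
from bs4 import BeautifulSoup

BASE = 'http://dmozlive.com'  # assumption: same mirror used above
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}

def crawl(start_path, max_depth=3):
    # breadth-first walk: visit each category page once, up to max_depth levels
    seen = set()
    queue = [(start_path, 0)]
    while queue:
        path, depth = queue.pop(0)
        if path in seen or depth > max_depth:
            continue
        seen.add(path)
        soup = BeautifulSoup(requests.get(BASE + path, headers=headers).text, 'html.parser')
        # assumption: category and author links sit inside .list-group blocks
        for link in soup.select('.list-group a'):
            href = link.get('href')
            if href and href.startswith('/Top/'):
                queue.append((href, depth + 1))
    return seen

all_urls = crawl('/Top/Arts/Literature/Authors')
print(len(all_urls))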