Python web scraping empty result


I followed a YouTube tutorial on web scraping to scrape this website, https://books.toscrape.com/, but I'm getting an empty result.

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup as bs

    all_books = []
    
    url = "http://books.toscrape.com/catalogue/page-1.html"
    def get_page(url):
        page = requests.get(url)
        status = page.status_code
        soup = bs(page.text, "lxml")
        return [soup, status]

    def get_links(soup):
        links = []
        listings = soup.find_all(class_="product_pod")

        def extract_info(links):
            for listing in listings:
                bk_lnk = listing.find("h5").a.get("href")
                base_url = "http://books.toscrape.com/catalogue"
                cmplt_lnk = base_url + bk_lnk
                links.append(cmplt_lnk)
            return links

    def extract_info(links):
        for link in links:
            res = requests.get(link).text
            book_soup = bs(res, "lxml")
            title = book_soup.find(class_="col-sm-6 product_main").h1.text.strip()
            price = book_soup.find(class_="col-sm-6 product_main").p.text.strip()
            book = {"title": title, "price": price}
            all_books.append(book)


            pg = 1
            while True:
                url = f"http://books.toscrape.com/catalogue/page-{pg}.html"
                soup_status = get_page(url)
                if soup_status[1] == 200:
                    print(f"scraping page {pg}")
                    extract_info(get_links(soup_status[0]))
                    pg += 1
                else:
                    print("The End")
                    break

    df = pd.DataFrame(all_books)

    print(df)

Here's the result I'm getting:

    Empty DataFrame
    Columns: []
    Index: []

My Colab notebook link:

https://colab.research.google.com/drive/1Lyvwt_WLpE9tqy1qheZg80N70CFSsk-E?usp=sharing

CodePudding user response:

Your list is empty. You need to actually call your functions, such as get_page(url), which returns a list whose soup element you can then pass to your subsequent functions.
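
For illustration, a minimal sketch of what calling those functions could look like, reusing get_page from the question and assuming get_links is fixed so that it actually returns the list it builds:

    # Minimal sketch: call the functions instead of only defining them.
    # Assumes get_page/get_links/extract_info from the question, with
    # get_links fixed to return its links list.
    soup, status = get_page("http://books.toscrape.com/catalogue/page-1.html")
    if status == 200:
        links = get_links(soup)   # only works if get_links returns the list
        extract_info(links)       # appends dicts to all_books as a side effect
    print(len(all_books))         # expect 20 entries for one catalogue page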

CodePudding user response:

    def get_links(soup):
        links = []
        listings = soup.find_all(class_="product_pod")

        def extract_links():
            for listing in listings:
                bk_lnk = listing.find("h3").a.get("href")
                base_url = "https://books.toscrape.com/catalogue/"
                cmplt_lnk = base_url + bk_lnk
                links.append(cmplt_lnk)
            return links
        return extract_links()

    def extract_info(links):
        for link in links:
            res = requests.get(link).text
            book_soup = bs(res, "lxml")
            title = book_soup.find(class_="col-sm-6 product_main").h1.text.strip()
            price = book_soup.find(class_="col-sm-6 product_main").p.text.strip()
            book = {"title": title, "price": price}
            all_books.append(book)

    pg = 45

    while True:
        url = f"https://books.toscrape.com/catalogue/page-{pg}.html"
        soup_status = get_page(url)
        if soup_status[1] == 200:
            print(f"scraping page {pg}")
            extract_info(get_links(soup_status[0]))
            pg += 1
        else:
            print("The End")
            break
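
The key changes from the question's version: get_links now invokes its nested helper and returns the result (return extract_links()), the book link is looked up under h3 rather than h5 (the product_pod markup has no h5), and the while loop sits at module level so it actually executes. A quick illustrative check of the h3 point (my sketch, not part of the answer):

    # Inspect one product_pod to confirm where the book link lives.
    import requests
    from bs4 import BeautifulSoup as bs

    page = requests.get("https://books.toscrape.com/catalogue/page-1.html")
    pod = bs(page.text, "lxml").find(class_="product_pod")
    print(pod.h3)  # <h3><a href="..." title="...">...</a></h3>
    print(pod.h5)  # None, which is why the original find("h5").a failed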