Home > Back-end >  Word search with BeautifulSoup
Word search with BeautifulSoup

Time:10-05

I'm trying to scrape this news website "https://inshorts.com/en/read/national" and I'm fetching results of articles with heads Headline and news. I need all the articles on the pages which contain a specific word (e.g., "health") and add "date" on the head.

Here's my code:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
# code for scraping the first page
d={'headlines':[],'news':[], 'date':[]}
r = requests.get("https://inshorts.com/en/read/national")
soup = BeautifulSoup(r.content, 'html.parser')
min_news_id = soup.findAll("script",{"type":"text/javascript"})[2].text
min_news_id = min_news_id[25:35]
soup=soup.findAll("div",{"class":"news-card z-depth-1"})

#to search specific word in the content
soup = soup.find_all(text=re.compile("Health"))

for data in soup:
    d['headlines'].append(data.find(itemprop="headline").getText())
    d['news'].append(data.find(itemprop="articleBody").getText())
    d['date'].append(data.find(itemprop="date").getText())

# code for scraping more pages
for i in tqdm(range(10)):
# It uses JavaScript to load more data from
# https://inshorts.com/en/ajax/more_news using POST requests
# with parameter 'news_offset' which informs server what page
# it has to send to client.
# we can make POST requests with this parameter to get new
# data in JSON format
    try:
        params = {'news_offset': min_news_id}
        req = requests.post("https://inshorts.com/en/ajax/more_news",data=params)
#In JSON you have HTML in json_data['html'] and
#json_data['min_news_id'] for next page

        json_data = req.json()
        min_news_id = json_data['min_news_id']
        soup = BeautifulSoup(json_data['html'], 'html.parser')
        soup=soup.findAll("div",{"class":"news-card z-depth-1"})
        for data in soup:
            d['headlines'].append(data.find(itemprop="headline").getText())
            d['news'].append(data.find(itemprop="articleBody").getText())
            d['date'].append(data.find(itemprop="date").getText())

    except:
        pass
# storing the data into .csv file
df = pd.DataFrame(d)
df.to_csv("inshorts_news.csv", index=False)

And here's the error:

AttributeError                            Traceback (most recent call last)
<ipython-input-2-2d109f9dfc91> in <module>()
     12 
     13 #to search specific word in the content
---> 14 soup = soup.find_all(text=re.compile("Health"))
     15 
     16 for data in soup:

/usr/local/lib/python3.7/dist-packages/bs4/element.py in __getattr__(self, key)
   1882     def __getattr__(self, key):
   1883         raise AttributeError(
-> 1884             "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
   1885         )

AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?

CodePudding user response:

What happens?

As the error tells you are trying to find_all() on a ResultSet object, that wont work.

How to fix?

Iterate over the elements of the object and check there for your keyword:

for data in soup.select('div.news-card.z-depth-1'):
    if data.find(text=re.compile("farmer")):

Example

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re
# code for scraping the first page
d=[]
r = requests.get("https://inshorts.com/en/read/national")
soup = BeautifulSoup(r.content, 'html.parser')
min_news_id = soup.findAll("script",{"type":"text/javascript"})[2].text
min_news_id = min_news_id[25:35]

# code for scraping more pages
for i in tqdm(range(2)):

    try:
        params = {'news_offset': min_news_id}
        req = requests.post("https://inshorts.com/en/ajax/more_news",data=params)

        json_data = req.json()
        min_news_id = json_data['min_news_id']
        soup = BeautifulSoup(json_data['html'], 'html.parser')
        for data in soup.select('div.news-card.z-depth-1'):
            if data.find(text=re.compile("farmer")):
                d.append({
                    'headline': data.find(itemprop="headline").getText(),
                    'article': data.find(itemprop="articleBody").getText()
                })
                
    except Exception as e:
        print (e)
        
pd.DataFrame(d)

Output

    headline                                            article
0   Heavy traffic seen on DND Flyway at Noida toll...   Heavy traffic was witnessed on Delhi Noida Dir...
1   Farmers take out protest march in Haryana over...   Farmers have taken out a protest march in Hary...
2   Akhilesh Yadav detained in Lucknow after sit-i...   Samajwadi Party President Akhilesh Yadav was d...
3   Priyanka detained on way to UP's Lakhimpur Khe...   Congress leader Priyanka Gandhi Vadra was deta...
4   Rakesh Tikait reaches UP's Lakhimpur Kheri aft...   BKU leader Rakesh Tikait reached UP's Lakhimpu...
5   Opposition to start with 'Photo Ops' in Lakhim...   Uttar Pradesh Cabinet Minister Sidharth Nath S...
  • Related