I'm trying to scrape this news website "https://inshorts.com/en/read/national" and I'm fetching results of articles with heads Headline and news. I need all the articles on the pages which contain a specific word (e.g., "health") and add "date" on the head.
Here's my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
# code for scraping the first page
d={'headlines':[],'news':[], 'date':[]}
r = requests.get("https://inshorts.com/en/read/national")
soup = BeautifulSoup(r.content, 'html.parser')
min_news_id = soup.findAll("script",{"type":"text/javascript"})[2].text
min_news_id = min_news_id[25:35]
soup=soup.findAll("div",{"class":"news-card z-depth-1"})
#to search specific word in the content
soup = soup.find_all(text=re.compile("Health"))
for data in soup:
d['headlines'].append(data.find(itemprop="headline").getText())
d['news'].append(data.find(itemprop="articleBody").getText())
d['date'].append(data.find(itemprop="date").getText())
# code for scraping more pages
for i in tqdm(range(10)):
# It uses JavaScript to load more data from
# https://inshorts.com/en/ajax/more_news using POST requests
# with parameter 'news_offset' which informs server what page
# it has to send to client.
# we can make POST requests with this parameter to get new
# data in JSON format
try:
params = {'news_offset': min_news_id}
req = requests.post("https://inshorts.com/en/ajax/more_news",data=params)
#In JSON you have HTML in json_data['html'] and
#json_data['min_news_id'] for next page
json_data = req.json()
min_news_id = json_data['min_news_id']
soup = BeautifulSoup(json_data['html'], 'html.parser')
soup=soup.findAll("div",{"class":"news-card z-depth-1"})
for data in soup:
d['headlines'].append(data.find(itemprop="headline").getText())
d['news'].append(data.find(itemprop="articleBody").getText())
d['date'].append(data.find(itemprop="date").getText())
except:
pass
# storing the data into .csv file
df = pd.DataFrame(d)
df.to_csv("inshorts_news.csv", index=False)
And here's the error:
AttributeError Traceback (most recent call last)
<ipython-input-2-2d109f9dfc91> in <module>()
12
13 #to search specific word in the content
---> 14 soup = soup.find_all(text=re.compile("Health"))
15
16 for data in soup:
/usr/local/lib/python3.7/dist-packages/bs4/element.py in __getattr__(self, key)
1882 def __getattr__(self, key):
1883 raise AttributeError(
-> 1884 "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
1885 )
AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
CodePudding user response:
What happens?
As the error tells you are trying to find_all()
on a ResultSet object, that wont work.
How to fix?
Iterate over the elements of the object and check there for your keyword:
for data in soup.select('div.news-card.z-depth-1'):
if data.find(text=re.compile("farmer")):
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re
# code for scraping the first page
d=[]
r = requests.get("https://inshorts.com/en/read/national")
soup = BeautifulSoup(r.content, 'html.parser')
min_news_id = soup.findAll("script",{"type":"text/javascript"})[2].text
min_news_id = min_news_id[25:35]
# code for scraping more pages
for i in tqdm(range(2)):
try:
params = {'news_offset': min_news_id}
req = requests.post("https://inshorts.com/en/ajax/more_news",data=params)
json_data = req.json()
min_news_id = json_data['min_news_id']
soup = BeautifulSoup(json_data['html'], 'html.parser')
for data in soup.select('div.news-card.z-depth-1'):
if data.find(text=re.compile("farmer")):
d.append({
'headline': data.find(itemprop="headline").getText(),
'article': data.find(itemprop="articleBody").getText()
})
except Exception as e:
print (e)
pd.DataFrame(d)
Output
headline article
0 Heavy traffic seen on DND Flyway at Noida toll... Heavy traffic was witnessed on Delhi Noida Dir...
1 Farmers take out protest march in Haryana over... Farmers have taken out a protest march in Hary...
2 Akhilesh Yadav detained in Lucknow after sit-i... Samajwadi Party President Akhilesh Yadav was d...
3 Priyanka detained on way to UP's Lakhimpur Khe... Congress leader Priyanka Gandhi Vadra was deta...
4 Rakesh Tikait reaches UP's Lakhimpur Kheri aft... BKU leader Rakesh Tikait reached UP's Lakhimpu...
5 Opposition to start with 'Photo Ops' in Lakhim... Uttar Pradesh Cabinet Minister Sidharth Nath S...