I am trying to do web scraping using the BeautifulSoup and requests Python libraries. I want to extract the news titles from the Hacker News website, but my code raises an error when I run it.
import requests
from bs4 import BeautifulSoup
# Download the Hacker News front page.
res = requests.get('https://news.ycombinator.com/news')
# Parse the response body with the 'html.parser' backend.
soup = BeautifulSoup(res.text, 'html.parser')
# Every anchor nested anywhere inside an element with the `titleline` class.
links = soup.select('.titleline a')
# Every element with the `subtext` class.
subtext = soup.select('.subtext')
# Build a list of {'title', 'href'} dicts from `links`, reading a score for
# each entry from the element at the same position in `subtext`.
# NOTE(review): the indentation of this snippet was lost in the paste, so the
# nesting below (in particular whether the final `hn.append` sits inside the
# `if`) cannot be confirmed from here.
def create_custom_hn(links, subtext):
hn = []
for index, item in enumerate(links):
title = links[index].getText()
href = links[index].get('href', None)
# `subtext` is indexed by the position within `links`; this raises IndexError
# as soon as `links` has more entries than `subtext`.
votes = subtext[index].select('.score')
if len(votes):
# Strip the ' points' suffix and convert the remaining text to an int.
points = int(votes[0].getText().replace(' points', ''))
print(points)
# `points` is printed above but is not stored in the result dict.
hn.append({'title': title, 'href': href})
return hn
# Run the extraction on the selections made above and print the resulting list.
print(create_custom_hn(links, subtext))
The error says
votes = subtext[index].select('.score')
~~~~~~~^^^^^^^
IndexError: list index out of range
CodePudding user response:
Here is a fixed version of the code from the question:
import requests
from bs4 import BeautifulSoup
# Download the Hacker News front page.
res = requests.get("https://news.ycombinator.com/news")
# Parse the response body with the "html.parser" backend.
soup = BeautifulSoup(res.text, "html.parser")
# The child combinator (`>`) keeps only anchors that are direct children of a
# `.titleline` element, not anchors nested deeper inside it.
links = soup.select(".titleline > a")
def create_custom_hn(links):
    """Build one dict per title anchor with its title, href and score.

    Args:
        links: Iterable of title anchor elements (as returned by
            ``soup.select(".titleline > a")``).

    Returns:
        A list of ``{"title": str, "href": str | None, "points": int | None}``
        dicts, in the order of ``links``. ``points`` is ``None`` when no score
        element is found for the entry.
    """
    hn = []
    for link in links:
        # Look the score up inside the subtext element that follows this
        # title instead of searching the rest of the document, so an entry
        # without a score does not pick up a later entry's score.
        # NOTE(review): assumes every title is followed by its own `.subtext`
        # element - confirm against the live markup.
        subtext = link.find_next(class_="subtext")
        score = subtext.select_one(".score") if subtext is not None else None

        # Take the leading number so both "1 point" and "97 points" parse.
        points = int(score.getText().split()[0]) if score is not None else None

        hn.append(
            {
                "title": link.getText(),
                "href": link.get("href", None),
                "points": points,
            }
        )
    return hn
# Run the extraction on the selected title anchors and print the resulting list.
print(create_custom_hn(links))
Prints:
[
{
"title": "Urllib3 in 2022",
"href": "https://sethmlarson.dev/urllib3-in-2022",
"points": 97,
},
{
"title": "First public release of Pushup: a new compiler for making web apps in Go",
"href": "https://github.com/adhocteam/pushup",
"points": 18,
},
{
"title": "Intelligence – A good collection of great OSINT Resources",
"href": "https://github.com/ARPSyndicate/awesome-intelligence",
"points": 113,
},
{
"title": "Microsoft is preparing to add ChatGPT to Bing",
"href": "https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter",
"points": 760,
},
...and so on.
CodePudding user response:
Try to select your elements more specifically: your selection soup.select('.titleline a')
returns more elements (60) than you want to select (30), because it also matches the site link next to each title:
[<a href="https://sethmlarson.dev/urllib3-in-2022">Urllib3 in 2022</a>,
<a href="from?site=sethmlarson.dev"><span >sethmlarson.dev</span></a>,...]
I would also recommend iterating over the elements in a different way, so that you are able to handle missing values.
Example
import requests
from bs4 import BeautifulSoup
# Download the Hacker News front page.
res = requests.get('https://news.ycombinator.com/news')
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(res.text, 'html.parser')

data = []
# Walk the page one story row at a time so each field is read relative to its
# own row and a missing value cannot shift the others.
for e in soup.select('tr.athing'):
    # Look the title anchor up once and reuse it for both text and href.
    title_link = e.select_one('.titleline a')

    # Read the score from the `.score` element itself; the first word of the
    # whole subtext is only the score when a score is actually present.
    subtext = e.find_next(class_='subtext')
    score = subtext.select_one('.score') if subtext is not None else None

    data.append({
        'title': title_link.get_text(),
        'url': title_link.get('href'),
        # Kept as a string (e.g. '93'); None when the row has no score.
        'votes': score.get_text().split()[0] if score is not None else None,
    })

print(data)
Output
[{'title': 'Urllib3 in 2022', 'url': 'https://sethmlarson.dev/urllib3-in-2022', 'votes': '93'}, {'title': 'First public release of Pushup: a new compiler for making web apps in Go', 'url': 'https://github.com/adhocteam/pushup', 'votes': '16'}, {'title': 'Intelligence – A good collection of great OSINT Resources', 'url': 'https://github.com/ARPSyndicate/awesome-intelligence', 'votes': '109'}, {'title': 'Microsoft is preparing to add ChatGPT to Bing', 'url': 'https://www.bloomberg.com/news/articles/2023-01-04/microsoft-hopes-openai-s-chatbot-will-make-bing-smarter', 'votes': '755'}, {'title': 'Juan Tamariz, the godfather of close-up card magic', 'url': 'https://www.nytimes.com/2023/01/02/magazine/juan-tamariz-magic.html', 'votes': '31'}, {'title': 'The Expanding Dark Forest and Generative AI', 'url': 'https://maggieappleton.com/ai-dark-forest', 'votes': '223'}, {'title': 'Irreconcilable differences between local and distributed computing (1994)', 'url': 'https://scholar.harvard.edu/waldo/publications/note-distributed-computing', 'votes': '29'},...]