I have the below code which extracts url and title of the google search page. However I am unable to extract the description.
I tried using .find
with div and class but I get back empty list or None
.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
base = "https://www.google.de"
link = "https://www.google.de/search?q={}"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
}
def grab_content(link):
res = requests.get(link,headers=headers)
soup = BeautifulSoup(res.text,"lxml")
for container in soup.select("[class='g'] a[href^='http'][data-ved]:has(h3)"):
post_title = container.select_one("h3").get_text(strip=True)
post_description = container.select_one("div", {"class": "VwiC3b yXK7lf MUxGbd yDYNvb lyLwlc lEBKkf"}).get_text()
post_link = container.get('href')
yield post_title,post_link,post_description
next_page = soup.select_one("a[href][id='pnnext']")
if next_page:
next_page_link = urljoin(base,next_page.get("href"))
yield from grab_content(next_page_link)
if __name__ == '__main__':
search_keyword = "python"
qualified_link = link.format(search_keyword.replace(" "," "))
for item in grab_content(qualified_link):
print(item)
My output: ('Welcome to Python.org', 'https://www.python.org/', 'https://www.python.org') I am getting title, and url but I am unable to get description which will be something like
The official home of the Python Programming Language.
CodePudding user response:
You can try this:
post_description = container.select_one("div.VwiC3b.yXK7lf.MUxGbd.yDYNvb.lyLwlc.lEBKkf").get_text()
CodePudding user response:
Instead selecting by class
that may can change, start from a known part for example the container
- Select its parent
and grab next_siblings
text
post_description = container.parent.next_sibling.get_text()
Output
('Welcome to Python.org', 'https://www.python.org/', 'The official home of thePythonProgramming Language.')
...