I'm using selenium to first ask Google a question and then scrape the first few results. I'm trying to add all URLs, Titles, and Descriptions to a Dict which I can then access later. Unfortunately, I can't get it to work - returns 'No Data Found'. Does anyone have an idea of what may be the issue?
Here is what I'm doing:
options = Options()
options.add_argument("--headless")
def googleSearch(query):
# specifing browser web driver
driver = webdriver.Chrome(options=options, executable_path='chromedriver')
# search query
search_engine = "https://www.google.com/search?q="
query = query.replace(" "," ")
driver.get(search_engine query "&start=" "0")
# stored data
# which will be returned by this function
data = {}
# number of search reasult count of first page
s_len = 5
for s_block in range(s_len):
# result block
content_block_xpath = f'''//*[@id="yuRUbf"]/div[{s_block}]/div/div'''
# xpaths
xpath_url = f"""{content_block_xpath}/div[1]/a"""
xpath_title = f"""{content_block_xpath}/div[1]/a/h3"""
xpath_description = f"""{content_block_xpath}/div[2]/span/span"""
try:
# store data collected of each s_block to block {}
block = {}
# find url of content
url = driver.find_element(By.XPATH, xpath_url)
url = url.get_attribute('href')
links.append(url.get('href'))
# find domain name of web having content
pattern = r"""(https?:\/\/)?(([a-z0-9-_] \.)?([a-z0-9-_] \.[a-z0-9-_] ))"""
domain = re.search(pattern, url)[0]
print(links)
# find title of content
# title = driver.find_element_by_xpath(xpath_title)
title = driver.find_element(By.XPATH, xpath_title)
title = title.get_attribute("innerText")
# find description of content
# description = driver.find_element_by_xpath(xpath_description)
description = driver.find_element(By.XPATH, xpath_description)
description = description.get_attribute("innerText")
# save all data to block {}
block["domain"] = domain
block["url"] = url
block["title"] = title
block["description"] = description
# save block dictionary to main dictionary
data[f'{s_block}'] = block
except exceptions.NoSuchElementException:
continue
if len(data) == 0:
raise Exception("No data found")
driver.close()
return data
def getQuery():
query = str('How to change a car tire')
link = googleSearch(query)
print(link)
getQuery()
CodePudding user response:
I see two problems:
- a mix-up with class and id regarding the use of "yuRUbf"
- indexing in xpath starts at 1 and not 0
content_block_xpath = f'''(//*[@])[{s_block}]'''
xpath_url = f"""{content_block_xpath}/a"""
xpath_title = f"""{content_block_xpath}/a/h3"""
xpath_description = f"""{content_block_xpath}/a//cite/span"""