I'm trying to scrape twitter depending on different keywords, I want the script to take the words one by one and clear the search box each time to use the next one, but I have a problem with that
===========================================
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from time import sleep
searchbox = driver.find_element_by_xpath('//input[@aria-label="Search query"]')
keywords = ['Dog','Cat','Fox']
for keyword in keywords:
searchbox.clear()
searchbox.send_keys(keyword)
searchbox.send_keys(Keys.RETURN)
sleep(10)
driver.find_element_by_link_text('Latest').click()
sleep(5)
data = []
tweet_ids = set()
Keywoed=keyword
last_position = driver.execute_script("return window.pageYOffset;")
scrolling = True
while scrolling:
page_info = driver.find_elements_by_xpath('//article[@data-testid="tweet"]')
for info in page_info:
tweet = get_tweet_data(info)
if tweet:
tweet_id = ','.join(map(str, tweet))
if tweet_id not in tweet_ids:
tweet_ids.add(tweet_id)
data.append(tweet)
scroll_attempt = 0
while True:
# check scroll position
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
sleep(5)
curr_position = driver.execute_script("return window.pageYOffset;")
if last_position == curr_position:
scroll_attempt = 1
if scroll_attempt >= 3:
scrolling = False
break
else:
sleep(5) # attempt another scroll
else:
last_position = curr_position
break
def get_tweet_data(info):
UserName = info.find_element_by_xpath('.//span').text
try:
handle = info.find_element_by_xpath('.//span[contains(text(), "@")]').text
except NoSuchElementException:
return
try:
date = info.find_element_by_xpath('.//time').get_attribute('datetime')
except NoSuchElementException:
return
try:
image_element = info.find_elements_by_css_selector('div[data-testid="tweetPhoto"]')
images = []
for image_div in image_element:
href = image_div.find_element_by_tag_name("img").get_attribute("src")
images.append(href)
except NoSuchElementException:
return
try:
comment = info.find_element_by_xpath('.//div[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]').text
except NoSuchElementException:
return
retweet_cnt = info.find_element_by_xpath('.//div[@data-testid="retweet"]').text
like_cnt = info.find_element_by_xpath('.//div[@data-testid="like"]').text
tweet = (comment,UserName,handle,date,images,retweet_cnt, like_cnt)
return tweet
=============================================================
using searchbox.clear() didn't help and it give me the error :
for keyword in keywords:
----> searchbox.clear()
searchbox.send_keys(keyword) def clear(self): """Clears the text if it's a text entry element."""
---> self._execute(Command.CLEAR_ELEMENT)
def get_property(self, name):
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document (Session info: chrome=101.0.4951.54)
CodePudding user response:
You are navigating away from the page - this makes the searchbox element "stale". This means you have navigated away from the page/the searchbox element was no longer visible for ANY period of time.
To solve this issue you must load the page with the searchbox element, rerun the code to find the searchbox element, and then run the code.
I would suggest doing something similar to:
keywords = ['Dog','Cat','Fox']
for keyword in keywords:
driver.get("page_with_searchbox_element")
searchbox = driver.find_element_by_xpath('//input[@aria-label="Search query"]')
searchbox.clear()
searchbox.send_keys(keyword)
# Continue the rest of the code here...
This will reload the page with each attempt and you should no longer get the stale element exception.
CodePudding user response:
Thank you @Jeremy
it works perfectly after I understand the problem and fix it like this:
keywords = ['Dog','Cat','Fox']
for keyword in keywords:
driver.get("https://twitter.com/search?q=" keyword "&src=typed_query&f=live")