I'm trying to scrape the followers of a Twitter account, but my code also collects unnecessary links that are not profile pages (Twitter accounts).
The code below opens the followers page of the Twitter account you want to scrape and collects the profile-page links by locating elements with XPath, gradually scrolling down so that all of the followers get loaded.
Here's my code:
def extract_followers_func():
    """Scroll through a Twitter followers page and save the links it contains.

    Uses the module-level Selenium ``driver`` and ``sleep``. Links are
    harvested once after the initial page load and again after each scroll,
    until scrolling no longer increases the page height. Every new link is
    appended, one per line, to ``scrapedlist.txt``.

    NOTE(review): the posted code had lost its indentation; harvesting after
    *every* scroll (rather than once after the loop) is assumed here.
    """
    # Imported locally so this snippet also works when pasted on its own.
    from selenium.webdriver.common.by import By

    # This locator matches every link inside the followers timeline, which is
    # why non-profile links end up in the output as well.
    links_xpath = '//div[@aria-label="Timeline: Followers"]//a[@role="link"]'
    seen = set()

    def save_new_links():
        # Each pass finds links from earlier passes again, so only hrefs that
        # have not been written during this run are appended.
        with open("scrapedlist.txt", "a") as out:
            for link in driver.find_elements(By.XPATH, links_xpath):
                href = link.get_property('href')
                if href and href not in seen:
                    seen.add(href)
                    out.write(href + "\n")

    driver.get("https://twitter.com/Username/followers")
    sleep(5)
    save_new_links()
    sleep(5)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(5)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The page stopped growing: nothing more to load.
            break
        last_height = new_height
        save_new_links()
What would be a more effective way to do this? I want only the usernames, not every unrelated link.
Full code:
import tkinter as tk
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
import threading
import time
from time import sleep
import datetime
# Chrome is started with the headless flag, i.e. without a visible window.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("start-maximized")

root = tk.Tk()
app_width = 300
app_height = 320
screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()
# Top-left corner that centres the window on the screen.
x = (screen_width / 2) - (app_width / 2)
y = (screen_height / 2) - (app_height / 2)
# Tk geometry strings have the form "WIDTHxHEIGHT+X+Y": the offsets must be
# joined with "+" signs, not spaces, or Tk rejects the specifier.
root.geometry(f'{app_width}x{app_height}+{int(x)}+{int(y)}')
#
# Raw string so the backslashes in the Windows path are not read as escapes.
ser = Service(r"C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(service=ser, options=options)
wait = WebDriverWait(driver, 50)
testbtn_txt = tk.StringVar()
# The lambda defers the name lookup until the button is clicked, because
# extract_followers_func is only defined further down the file.
testbtn = tk.Button(root, textvariable=testbtn_txt, command=lambda: extract_followers_func(), font="Arial", bg="#808080", fg="white", height=1, width=10)
testbtn_txt.set("Test")
testbtn.grid(row=10, column=0, columnspan=2, pady=5, padx=5)
def extract_followers_func():
    """Scroll through a Twitter followers page and save the follower links.

    Uses the module-level Selenium ``driver`` and ``sleep``. Links are
    harvested once after the initial page load and again after each scroll,
    until scrolling no longer increases the page height. Every new link is
    appended, one per line, to ``scrapedlist.txt``.

    NOTE(review): the posted code had lost its indentation; harvesting after
    *every* scroll (rather than once after the loop) is assumed here.
    """
    # The inner quotes must differ from the quotes delimiting the Python
    # string, otherwise the literal ends early and the file does not parse.
    # The predicates drop anchors that carry aria-hidden or rel attributes and
    # anchors whose href contains "search" or "Live".
    links_xpath = (
        '//div[@aria-label="Timeline: Followers"]'
        '//a[@role="link" and not(@aria-hidden)'
        ' and not(contains(@href,"search"))'
        ' and not(contains(@href,"Live"))'
        ' and not(@rel)]'
    )
    seen = set()

    def save_new_links():
        # Each pass finds links from earlier passes again, so only hrefs that
        # have not been written during this run are appended.
        with open("scrapedlist.txt", "a") as out:
            for link in driver.find_elements(By.XPATH, links_xpath):
                href = link.get_property('href')
                if href and href not in seen:
                    seen.add(href)
                    out.write(href + "\n")

    driver.get("https://twitter.com/Username/followers")
    sleep(5)
    save_new_links()
    sleep(5)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(5)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The page stopped growing: nothing more to load.
            break
        last_height = new_height
        save_new_links()
# Start the Tk event loop so the window is shown and the button reacts to clicks.
root.mainloop()
CodePudding user response:
You are almost there!
You just need to fine-tune the locator.
So, instead of
'//div[@aria-label="Timeline: Followers"]//a[@role="link"]'
You should use
'//div[@aria-label="Timeline: Followers"]//a[@role="link" and not(@aria-hidden) and not(contains(@href,"search")) and not(contains(@href,"Live")) and not(@rel)]'