How do I scrape Twitter usernames using Selenium properly?-CodePudding

So, I'm trying to scrape Twitter followers but the issue is, it scrapes unnecessary links too that are not profile pages (Twitter accs).

What the below code does is, open the Twitter account page that you want to scrape followers from, and gets links of profile pages using locate element by xpath, while gradually scrolling down to get all the present followers.

Here's my code:

def extract_followers_func():
    driver.get("https://twitter.com/Username/followers")
    sleep(5)
    for twusernames in driver.find_elements_by_xpath('//div[@aria-label="Timeline: Followers"]//a[@role="link"]'):
        file = open("scrapedlist.txt", "a")
        file.write(twusernames.get_property('href'))
        file.write("\n")
        file.close()
    sleep(5)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(5)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        for twusernames in driver.find_elements_by_xpath('//div[@aria-label="Timeline: Followers"]//a[@role="link"]'):
            file = open("scrapedlist.txt", "a")
            file.write(twusernames.get_property('href'))
            file.write("\n")
            file.close()

What would be a more effective way to do this? I want just the usernames, not every unnecessary link.

Full code:

import tkinter as tk

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.chrome.options import Options

from selenium.webdriver.chrome.service import Service

from selenium.common.exceptions import TimeoutException

import threading

import time

from time import sleep

import datetime

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("start-maximized")

root = tk.Tk()

app_width = 300
app_height = 320

screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()

x = (screen_width / 2) - (app_width / 2)
y = (screen_height / 2) - (app_height / 2)

root.geometry(f'{app_width}x{app_height} {int(x)} {int(y)}')

#
ser = Service("C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(service=ser, options=options)
wait = WebDriverWait(driver, 50)

testbtn_txt = tk.StringVar()
testbtn = tk.Button(root, textvariable=testbtn_txt, command=lambda:extract_followers_func(), font="Arial", bg="#808080", fg="white", height=1, width=10)
testbtn_txt.set("Test")
testbtn.grid(row=10, column=0, columnspan=2, pady=5, padx=5)


def extract_followers_func():
    driver.get("https://twitter.com/Username/followers")
    sleep(5)
    for twusernames in driver.find_elements_by_xpath('//div[@aria-label="Timeline: Followers"]//a[@role="link" and not(@aria-hidden) and not(contains(@href,'search')) and not(contains(@href,'Live')) and not(@rel)]'):
        file = open("scrapedlist.txt", "a")
        file.write(twusernames.get_property('href'))
        file.write("\n")
        file.close()
    sleep(5)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(5)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        for twusernames in driver.find_elements_by_xpath('//div[@aria-label="Timeline: Followers"]//a[@role="link" and not(@aria-hidden) and not(contains(@href,'search')) and not(contains(@href,'Live')) and not(@rel)]'):
            file = open("scrapedlist.txt", "a")
            file.write(twusernames.get_property('href'))
            file.write("\n")
            file.close()



root.mainloop()

CodePudding user response：

You are almost there!
You just need to finetune the locator.
So, instead of

'//div[@aria-label="Timeline: Followers"]//a[@role="link"]'

You should use

'//div[@aria-label="Timeline: Followers"]//a[@role="link" and not(@aria-hidden) and not(contains(@href,"search")) and not(contains(@href,"Live")) and not(@rel)]'