Scrape traffic data that requires login with Selenium - CodePudding

I need to scrape the Authority Score, Organic Search Traffic, and Backlinks for burton.com from Semrush using Selenium.

The script below gives some errors. (Image: the Semrush data to be scraped.)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


# Question script: open the Semrush domain overview for burton.com in headless
# Chrome and print the text of the links matching one XPath.
# NOTE(review): left exactly as posted, because the warnings and traceback
# quoted below refer to these lines.
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
# NOTE(review): the options object configured above is discarded when
# `options` is rebound to a fresh Options() two lines below.

url = 'https://www.semrush.com/analytics/overview/?q=burton.com&searchType=domain' #your url
options = Options() #set up options
options.add_argument('--headless') #add --headless mode to options
# executable_path and chrome_options are what trigger the two
# DeprecationWarnings shown in the output below.
driver = webdriver.Chrome(executable_path='c:\chromedriver.exe',chrome_options=options)

#note: executable_path will depend on where your chromedriver.exe is located

driver.get(url) #get response
driver.implicitly_wait(1) #implicit wait of 1 second for element lookups, set only after the page was requested
# This is the call that raises NoSuchElementException in the traceback below.
# NOTE(review): find_element (singular) returns one element rather than a
# list, so the for-loop below could not iterate it even if the lookup succeeded.
elements = driver.find_element("xpath", '//a[@href="/info/burton.com (by organic)"]') #look up the link by its exact href

for e in elements: print(e.get_attribute('text').strip()) #print text fields

# Not reached when the lookup above raises, so the browser is left running.
driver.quit() #close the driver when you're done

Below is the error I get in Visual Studio Code. Semrush requires logging in with a free trial to see the above data; is that what causes the problem here?

PS C:\Users\akein> & C:/Python310/python.exe c:/Users/akein/OneDrive/Desktop/aaa.py
c:\Users\akein\OneDrive\Desktop\aaa.py:12: DeprecationWarning: executable_path has been deprecated, please pass in a Service object
  driver = webdriver.Chrome(executable_path='c:\chromedriver.exe',chrome_options=options)
c:\Users\akein\OneDrive\Desktop\aaa.py:12: DeprecationWarning: use options instead of chrome_options
  driver = webdriver.Chrome(executable_path='c:\chromedriver.exe',chrome_options=options)

DevTools listening on ws://127.0.0.1:50030/devtools/browser/6a717a35-4404-46d0-b2df-fa1ba06fbb3d
[1008/234714.670:INFO:CONSOLE(2)] "limitPopup", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)
[1008/234715.006:INFO:CONSOLE(2)] "SyntaxError: Unexpected token 'B', "Bad Request
" is not valid JSON", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)
[1008/234715.058:INFO:CONSOLE(2)] "dataLayerProxy:  prop [[getByName]] is not supported", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)
[1008/234715.059:INFO:CONSOLE(2)] "dataLayerProxy:  method call is not supported", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)       
[1008/234715.059:INFO:CONSOLE(2)] "dataLayerProxy:  prop [[getByName]] is not supported", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)
[1008/234715.060:INFO:CONSOLE(2)] "dataLayerProxy:  method call is not supported", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)       
[1008/234715.060:INFO:CONSOLE(2)] "dataLayerProxy:  prop [[getByName]] is not supported", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)
[1008/234715.060:INFO:CONSOLE(2)] "dataLayerProxy:  method call is not supported", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)       
[1008/234715.068:INFO:CONSOLE(2)] "dataLayerProxy:  prop [[getByName]] is not supported", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)
[1008/234715.068:INFO:CONSOLE(2)] "dataLayerProxy:  method call is not supported", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)
[1008/234715.433:INFO:CONSOLE(2)] "SSO Frontend. You are using old value for defaultActiveTab parameter.
      Please use loginForm instead of login.
      For more information see the documentation.", source: https://static.semrush.com/domain-overview/vendor.2365e1d7f296adbbe3c8.chunk.js (2)
Traceback (most recent call last):
  File "c:\Users\akein\OneDrive\Desktop\aaa.py", line 18, in <module>
    elements = driver.find_element("xpath", '//a[@href="/info/burton.com (by organic)"]') #grab that stuff you wanted?
  File "C:\Python310\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 856, in find_element
    return self.execute(Command.FIND_ELEMENT, {
  File "C:\Python310\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 429, in execute
    self.error_handler.check_response(response)
  File "C:\Python310\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 243, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//a[@href="/info/burton.com (by organic)"]"}
  (Session info: headless chrome=106.0.5249.103)
Stacktrace:
Backtrace:
        Ordinal0 [0x00D71ED3 2236115]
        Ordinal0 [0x00D092F1 1807089]
        Ordinal0 [0x00C166FD 812797]
        Ordinal0 [0x00C455DF 1005023]
        Ordinal0 [0x00C457CB 1005515]
        Ordinal0 [0x00C77632 1209906]
        Ordinal0 [0x00C61AD4 1120980]
        Ordinal0 [0x00C759E2 1202658]
        Ordinal0 [0x00C618A6 1120422]
        Ordinal0 [0x00C3A73D 960317]
        Ordinal0 [0x00C3B71F 964383]
        GetHandleVerifier [0x0101E7E2 2743074]
        GetHandleVerifier [0x010108D4 2685972]
        GetHandleVerifier [0x00E02BAA 532202]
        GetHandleVerifier [0x00E01990 527568]
        Ordinal0 [0x00D1080C 1837068]
        Ordinal0 [0x00D14CD8 1854680]
        Ordinal0 [0x00D14DC5 1854917]
        Ordinal0 [0x00D1ED64 1895780]
        BaseThreadInitThunk [0x7666FA29 25]
        RtlGetAppContainerNamedObjectPath [0x77427A9E 286]
        RtlGetAppContainerNamedObjectPath [0x77427A6E 238]

a-- ---- ---- --- ---a --- -- -- --- --b ----- ---- -- - - - - - - - - - - - - - - - - - - - - - -

- - - - -- - - -s- - - - - - - -- - - - - - - - - - - - - - - - - - - - - - -- - - - -- - - - - - - - - - - - - --

CodePudding user response：

The error `no such element: Unable to locate element: {"method":"xpath","selector":"//a[@href="/info/burton.com (by organic)"]"}` occurs because no element matching that XPath is present on the page.

First, after opening the site, we need to log in.

After that, to extract the Authority Score, Organic Search Traffic, and Backlinks from the site, we can locate each field by its label and find the value relative to that label (since the value element does not have any specific id).

Your solution would look like

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Log in to Semrush with headless Chrome and print the Authority Score,
# Organic search traffic and Backlinks shown on the domain overview page.

URL = 'https://www.semrush.com/analytics/overview/?q=burton.com&searchType=domain'  # your url
# Raw string so the backslash is never treated as an escape sequence.
CHROMEDRIVER_PATH = r'c:\chromedriver.exe'  # depends on where your chromedriver.exe is located
EMAIL = "your usename"  # placeholder: replace with your Semrush login
PASSWORD = "your password"  # placeholder: replace with your Semrush password
# Labels of the fields for which details need to be fetched.
TAGS = ['Authority Score', 'Organic search traffic', 'Backlinks']
# Upper bound, in seconds, for each explicit wait on the page.
WAIT_SECONDS = 10


def metric_xpath(tag):
    """Return the XPath of the main number displayed for the field labelled *tag*.

    The value element has no id of its own, so it is located relative to the
    <span> holding the label: up to the enclosing column <div>, then down to
    the link marked data-at='main-number'.

    *tag* is inserted into the expression unescaped, so it must not contain a
    single quote.
    """
    return (
        f"//span[contains(text(), '{tag}')]"
        "/ancestor::div[@direction='column']"
        "/descendant::a[@data-at='main-number']/span"
    )


def build_driver():
    """Start headless Chrome with every option set on a single Options object."""
    options = Options()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument('--headless')
    # Chrome expects "width,height" for this switch.
    options.add_argument('--window-size=1920,1080')
    # Service/options replace the deprecated executable_path/chrome_options kwargs.
    service = Service(executable_path=CHROMEDRIVER_PATH)
    return webdriver.Chrome(service=service, options=options)


def log_in(driver, wait):
    """Open the login form, submit the credentials and wait for the login to finish."""
    log_in_link = (By.XPATH, "//span[contains(text(), 'Log In')]")
    wait.until(EC.element_to_be_clickable(log_in_link)).click()
    # The form is rendered by script after the click, so wait for it to appear.
    wait.until(EC.visibility_of_element_located((By.ID, "email"))).send_keys(EMAIL)
    driver.find_element(By.ID, "password").send_keys(PASSWORD)
    driver.find_element(By.XPATH, "//div[contains(text(), 'Log in')]").click()
    # Ensure user is logged in.
    # NOTE(review): presumably a <use> element only exists once the logged-in
    # page has rendered -- confirm against the live page.
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "use")))


def main():
    """Print the value of every field listed in TAGS, one per line."""
    driver = build_driver()
    try:
        driver.get(URL)  # get response
        wait = WebDriverWait(driver, WAIT_SECONDS)
        log_in(driver, wait)
        for tag in TAGS:
            locator = (By.XPATH, metric_xpath(tag))
            print(wait.until(EC.presence_of_element_located(locator)).text)
    finally:
        # Close the driver even when a lookup fails, so no browser is left running.
        driver.quit()


if __name__ == "__main__":
    main()

CodePudding user response：

You need to log in to the page first and then use XPath to find the element. Alternatively, you can attach to a page that is already open (and logged in) in your browser. To attach to an existing browser, you can use clicknium.

from clicknium import clicknium as cc
# Attach to a Chrome tab that already shows the Semrush overview page and look
# up the link by XPath.
# first install chrome extension
cc.chrome.extension.install()
# then attach to browser with url, use wildcard(*) if part of the url may change
# The URL must not contain a space after "?", otherwise the pattern cannot
# match the address of the open tab.
tab = cc.chrome.attach_by_title_url(url="https://www.semrush.com/analytics/overview/?q=burton.com&searchType=domain*")
elements = tab.find_elements_by_xpath('//a[@href="/info/burton.com (by organic)"]')