I am attempting to log into LinkedIn to perform webscraping using the following code in Google Collab:
#Import relevant packages
from bs4 import BeautifulSoup as bs
import time
import pandas as pd
import re as re
# Install chromium, its driver, and selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
# set options to be headless, ..
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# open it, go to a website, and get results
browser = webdriver.Chrome('chromedriver',options=options)
browser.get('https://www.linkedin.com/login')
time.sleep(3)
#METHOD 1:
#browser.find_element_by_id('[email protected]').send_keys(email)
#browser.find_element_by_id('T35t11!').send_keys(password)
#browser.find_element_by_id('T35t11!').send_keys(Keys.RETURN)
#METHOD 2:
# locate email form by_class_name
username = browser.find_element_by_class_name('login-email')
# send_keys() to simulate key strokes
username.send_keys('[email protected]')
# locate password form by_class_name
password = browser.find_element_by_class_name('login-password')
# send_keys() to simulate key strokes
password.send_keys('T35t11!')
# locate submit button by_class_name
log_in_button = browser.find_element_by_class_name('login-submit')
# locate submit button by_class_id
log_in_button = browser.find_element_by_class_id('login submit-button')
# locate submit button by_xpath
log_in_button = browser.find_element_by_xpath('//*[@type="submit"]')
# .click() to mimic button click
log_in_button.click()
# Go to webpage
page = "https://www.linkedin.com/company/cgi/"
browser.get(page 'posts/')
SCROLL_PAUSE_TIME = 1.5
# Scroll webpage
height = browser.execute_script("return document.documentElement.scrollHeight")
browser.execute_script("window.scrollTo(0, " str(height) ");")
company_page = browser.page_source
linkedin_soup = bs(company_page.encode("utf-8"), "html")
linkedin_soup.prettify()
containers = linkedin_soup.findAll("div",{"class":"occludable-update ember-view"})
post_dates = []
post_texts = []
for container in containers:
try:
posted_date = container.find("span",{"class":"visually-hidden"})
text_box = container.find("div",{"class":"feed-shared-update-v2__description-wrapper"})
text = text_box.find("span",{"dir":"ltr"})
post_dates.append(posted_date.text.strip())
post_texts.append(text.text.strip())
except:
pass
data = {
"Date Posted": post_dates,
"Post Text": post_texts,
}
df = pd.DataFrame(data)
df
However, I am receiving the following error message:
NoSuchElementException Traceback (most recent call last)
<ipython-input-15-985af2eb8d2f> in <module>()
25
26 # locate email form by_class_name
---> 27 username = browser.find_element_by_class_name('login-email')
28 # send_keys() to simulate key strokes
29 username.send_keys('[email protected]')
3 frames
/usr/local/lib/python3.7/dist-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
241 alert_text = value['alert'].get('text')
242 raise exception_class(message, screen, stacktrace, alert_text) # type: ignore[call-arg] # mypy is not smart enough here
--> 243 raise exception_class(message, screen, stacktrace)
244
245 def _value_or_default(self, obj: Mapping[_KT, _VT], key: _KT, default: _VT) -> _VT:
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".login-email"}
(Session info: headless chrome=95.0.4638.69)
Stacktrace:
#0 0x5577ca240623 <unknown>
#1 0x5577c9f47d43 <unknown>
#2 0x5577c9f7d5f0 <unknown>
#3 0x5577c9fb1337 <unknown>
#4 0x5577c9f9a5fd <unknown>
#5 0x5577c9faf0ac <unknown>
#6 0x5577c9f9a9e3 <unknown>
#7 0x5577c9f71c0c <unknown>
#8 0x5577c9f730d5 <unknown>
#9 0x5577ca264954 <unknown>
#10 0x5577ca273f6d <unknown>
#11 0x5577ca273c8b <unknown>
#12 0x5577ca2745b2 <unknown>
#13 0x5577ca2ace8b <unknown>
#14 0x5577ca274811 <unknown>
#15 0x5577ca259831 <unknown>
#16 0x5577ca27d218 <unknown>
#17 0x5577ca27d3aa <unknown>
#18 0x5577ca2973bf <unknown>
#19 0x7f2cd7d546db <unknown>
I have tried two different methods as shown above which I found from different examples of web-scraping. Would you be able to let me know what the issue might be and what I could do to resolve this please?
Thank you.
CodePudding user response:
Use the URL: https://www.linkedin.com/checkpoint/rm/sign-in-another-account
for logging in on order to avoid the existing user selection page.
There is no element with the class of login-email
on the login page so use username = browser.find_element_by_id('username')
to locate the email field.