I am trying to crawl the site "https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=24212003", but the response contains only the header and a small part of the body; I am unable to get the full paragraph content or the links to the pages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Launch headless Chrome with a fixed window size, load the feedback page and
# dump the HTML Selenium sees.
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
try:
    driver.get("https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=24212003&page=1")
    # NOTE(review): page_source is read immediately after get() returns, with
    # no explicit wait; content rendered later by JavaScript may be absent --
    # presumably why the links are missing. TODO confirm against the live page.
    print(driver.page_source)
finally:
    # Always shut the browser down, even if loading or printing fails, so no
    # headless Chrome process is left running.
    driver.quit()
So the response does not contain the href attributes and tags that I need.
CodePudding user response:
Pass some experimental options to Chrome, then scrape the data:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# Configure Chrome: exclude a few launch switches, turn off the automation
# extension and run headless.
chrome_options = Options()
chrome_options.add_experimental_option(
    "excludeSwitches",
    [
        "enable-automation",
        'disable-component-update',
        'ignore-certificate-errors',
    ],
)
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("--headless")
# Pass the options object configured above so the experimental options and
# the headless flag actually take effect.
driver = webdriver.Chrome(options=chrome_options, executable_path=DRIVER_PATH)
try:
    driver.get("https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12527-Artificial-intelligence-ethical-and-legal-requirements/feedback_en?p_id=24212003&page=1")
    # Parse while the browser is still open; the soup is an independent copy
    # of the page source and stays usable after quit().
    soup = BeautifulSoup(driver.page_source, 'html')
finally:
    # Always close the browser, even if loading or parsing fails.
    driver.quit()
# Map each link's text to its href for every <a> tag that has an href.
# Links sharing the same text overwrite one another (last one wins).
d = {x.text: x['href'] for x in soup.find_all('a', href=True)}