I want to remove the header and footer sections, if present, from the scraped page data.
Code
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
# Fetch the rendered page with headless Chrome and flatten it to plain text.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
service = Service("/home/ubuntu/selenium_drivers/chromedriver")
URL = "https://www.uh.edu/kgmca/music/events/calendar/?view=e&id=30723#event"

# Defaults so both names exist even when the browser fails to start or load;
# otherwise the cleanup and the parse below would raise NameError.
driver = None
html_content = ""
try:
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(URL)
    # NOTE(review): an implicit wait only applies to element lookups, so it
    # presumably does not delay page_source here -- confirm whether an
    # explicit wait is needed for dynamically rendered content.
    driver.implicitly_wait(2)
    html_content = driver.page_source
except WebDriverException as exc:
    # Report the failure instead of swallowing it; html_content stays empty.
    print(f"Could not load {URL}: {exc}")
finally:
    # Always release the browser process, but only if it was actually created.
    if driver is not None:
        driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed on the machine.
soup = BeautifulSoup(html_content, "html.parser")
text = soup.get_text(separator=" ")
I tried removing the tags, but it's not working. How can this be achieved?
NB: Please upvote the question so that I will get more features from stackoverflow.
Thanks in Advance
CodePudding user response:
Option 1:
Just get the element and call .extract() on it.
Option 2:
The <main> tag sits right between the <header> and <footer> tags. Provided you only want that part, you could just say:
main = soup.find('main')
Also, is there any reason you're using Selenium? Doesn't simply using requests do the trick?
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
# Fetch the rendered page with headless Chrome, drop the <header> and <footer>
# sections if they exist, and flatten what remains to plain text.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
service = Service("/home/ubuntu/selenium_drivers/chromedriver")
URL = "https://www.uh.edu/kgmca/music/events/calendar/?view=e&id=30723#event"

# Defaults so both names exist even when the browser fails to start or load;
# otherwise the cleanup and the parse below would raise NameError.
driver = None
html_content = ""
try:
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(URL)
    # NOTE(review): an implicit wait only applies to element lookups, so it
    # presumably does not delay page_source here -- confirm whether an
    # explicit wait is needed for dynamically rendered content.
    driver.implicitly_wait(2)
    html_content = driver.page_source
except WebDriverException as exc:
    # Report the failure instead of swallowing it; html_content stays empty.
    print(f"Could not load {URL}: {exc}")
finally:
    # Always release the browser process, but only if it was actually created.
    if driver is not None:
        driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed on the machine.
soup = BeautifulSoup(html_content, "html.parser")

# Strip the unwanted sections BEFORE extracting text; doing it afterwards
# leaves the header/footer content in `text`.
for tag_name in ("header", "footer"):
    section = soup.find(tag_name)
    # find() returns None when the page has no such tag, so guard the call.
    if section is not None:
        section.extract()

text = soup.get_text(separator=" ")