I am trying to delete the data within a `nav` tag present in scraped data. I tried several methods and the tag is extracted successfully. But when I try to clean the rest of the data, the data from the `nav` tag still appears. I tried `extract` and `decompose`, but both give the same result.
Code
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib.parse
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
# Load one page in headless Chrome, remove the main nav, every header/footer
# and the modal dialog from the parsed HTML, then print the remaining text.
service = Service("/home/ubuntu/selenium_drivers/chromedriver")
options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3")
options.add_argument("--headless")
options.add_argument('--ignore-certificate-errors')
options.add_argument("--enable-javascript")
options.add_argument('--incognito')
URL = "https://michiganopera.org/season-schedule/frida/"

# The driver is created outside the try block: if Chrome fails to start there
# is nothing to quit, and the WebDriverException should surface as-is instead
# of being replaced by a NameError on an unbound `driver` / `html_content`.
driver = webdriver.Chrome(service=service, options=options)
try:
    driver.get(URL)
    # NOTE(review): implicitly_wait only sets the polling timeout for later
    # element lookups; it does not delay reading page_source. Use an explicit
    # wait if the page renders its content late.
    driver.implicitly_wait(2)
    html_content = driver.page_source
finally:
    # Always release the browser process, whether or not the load succeeded.
    driver.quit()

soup = BeautifulSoup(html_content, 'html.parser')

# find() returns only the first matching element, or None when there is no
# match, so guard before detaching it from the tree.
z = soup.find("nav", {"class": "nav-main"})
if z is not None:
    z.extract()

# find_all() yields only real tags, so extract() needs no exception guard.
for h in soup.find_all('header'):
    h.extract()
for f in soup.find_all('footer'):
    f.extract()

# The modal is optional; skip it when the page does not contain one.
cols = soup.find("div", {"class": "modal fade"})
if cols is not None:
    cols.extract()

text = soup.getText(separator=u' ')
print(text)
When we run this code, we get the cleaned data, but at the end of it there is a portion like the one below that has to be removed:
Section to be removed
Sponsors
Email Sign Up View Calendar
Season & Tickets Season at a Glance MOT at Home Upcoming Dance Theatre of Harlem Calendar Ways to save Subscriptions Groups Gift Certificates Box Office How to Avoid Scalper Tickets Plan Your Visit Parking & Directions Sunday Shuttles Dining Cadillac Café Hotels Opera & Dance Talks FAQ Online Boutique PLAN YOUR EVENT Catering & Events Weddings Corporate & Social Event Sky Deck COVID-19 Safety Plan Get Involved Community Events Young Patrons Circle Opera Teens Opera Clubs Ambassadors Volunteers Dance Film Series Learn Summer Programs Operetta Remix Dance Classes Children’s Choruses For Schools Field Trips In-School Performances Classroom Guides Tours Allesee Resource Library Dance Dialogues MOT Learns at Home Support Annual Fund & DiChiera Society Other Ways to Give Planned Giving David DiChiera Artistic Fund Sponsorship Opportunities Why I give to MOT About Us Our History MOT History DOH History Past Seasons David DiChiera Leadership Board of Directors Wayne S. Brown Yuval Sharon Christine Goerke Admin & Staff Our mission Antiracism Statement of Commitment Opera America Member Musicians Orchestra Roster Chorus Roster Children’s Choruses Non-Profit Status Press
I am facing the same issue with several sites. I think I am missing something here.
Thanks in advance
CodePudding user response:
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib.parse
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
# Load one page in headless Chrome, remove the main nav, every header/footer
# and the modal dialog from the parsed HTML, then print the text that comes
# before the first "Sponsors" marker.
service = Service("/home/ubuntu/selenium_drivers/chromedriver")
options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3")
options.add_argument("--headless")
options.add_argument('--ignore-certificate-errors')
options.add_argument("--enable-javascript")
options.add_argument('--incognito')
URL = "https://michiganopera.org/season-schedule/frida/"

# The driver is created outside the try block: if Chrome fails to start there
# is nothing to quit, and the WebDriverException should surface as-is instead
# of being replaced by a NameError on an unbound `driver` / `html_content`.
driver = webdriver.Chrome(service=service, options=options)
try:
    driver.get(URL)
    # NOTE(review): implicitly_wait only sets the polling timeout for later
    # element lookups; it does not delay reading page_source. Use an explicit
    # wait if the page renders its content late.
    driver.implicitly_wait(2)
    html_content = driver.page_source
finally:
    # Always release the browser process, whether or not the load succeeded.
    driver.quit()

soup = BeautifulSoup(html_content, 'html.parser')

# find() returns only the first matching element, or None when there is no
# match, so guard before detaching it from the tree.
z = soup.find("nav", {"class": "nav-main"})
if z is not None:
    z.extract()

# find_all() yields only real tags, so extract() needs no exception guard.
for h in soup.find_all('header'):
    h.extract()
for f in soup.find_all('footer'):
    f.extract()

# The modal is optional; skip it when the page does not contain one.
cols = soup.find("div", {"class": "modal fade"})
if cols is not None:
    cols.extract()

text = soup.getText(separator=u' ')

# Keep only what precedes the first occurrence of the marker; if the marker is
# absent, split() returns the whole text unchanged.
sep = 'Sponsors'
stripped = text.split(sep, 1)[0]
print(stripped)