Extracting Data from nav Tag with BeautifulSoup


I am trying to delete the data inside a nav tag in scraped HTML. I tried several methods and the element is extracted successfully, but when I clean the rest of the data, the nav content still appears in the output. I tried both extract() and decompose(), and they give the same result.
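
For context, extract() and decompose() both detach the matched tag from the parse tree (extract() returns the removed tag, decompose() discards it), so for this kind of cleanup they should behave the same. A minimal sketch on toy markup, not the page above:

from bs4 import BeautifulSoup

# Toy markup: both calls remove the nav element from the tree.
soup = BeautifulSoup('<div><nav class="nav-main">Menu</nav><p>Body</p></div>',
                     'html.parser')
soup.find("nav", {"class": "nav-main"}).extract()    # or .decompose()
print(soup.get_text(separator=' ', strip=True))      # prints: Body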

Code

from bs4 import BeautifulSoup
from selenium import webdriver
import urllib.parse
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service

service = Service("/home/ubuntu/selenium_drivers/chromedriver")

options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3")
options.add_argument("--headless")
options.add_argument('--ignore-certificate-errors')
options.add_argument("--enable-javascript")
options.add_argument('--incognito')

URL = "https://michiganopera.org/season-schedule/frida/"

try:
    driver = webdriver.Chrome(service = service, options = options)
    driver.get(URL)
    driver.implicitly_wait(2)
    html_content = driver.page_source
    driver.quit()
except WebDriverException:
    driver.quit()

soup = BeautifulSoup(html_content, 'html.parser')

# Remove the main navigation menu.
z = soup.find("nav", {"class": "nav-main"})
z.extract()

# Remove all header elements.
for h in soup.find_all('header'):
    try:
        h.extract()
    except:
        pass

# Remove all footer elements.
for f in soup.find_all('footer'):
    try:
        f.extract()
    except:
        pass

# Remove the modal dialog, if present.
try:
    cols = soup.find("div", {"class": "modal fade"})
    cols.extract()
except:
    pass

# Extract the remaining visible text.
text = soup.getText(separator=u' ')
print(text)

When this code runs, it returns mostly cleaned data, but a portion like the one below still appears at the end and needs to be removed:

Section to be removed

Sponsors

Email Sign Up View Calendar

Season & Tickets   Season at a Glance MOT at Home Upcoming   Dance Theatre of Harlem Calendar Ways to save   Subscriptions Groups Gift Certificates Box Office   How to Avoid Scalper Tickets Plan Your Visit   Parking & Directions   Sunday Shuttles Dining   Cadillac Café Hotels Opera & Dance Talks FAQ Online Boutique PLAN YOUR EVENT   Catering & Events Weddings Corporate & Social Event Sky Deck COVID-19 Safety Plan Get Involved   Community Events Young Patrons Circle Opera Teens Opera Clubs Ambassadors Volunteers Dance Film Series Learn   Summer Programs   Operetta Remix Dance Classes Children’s Choruses For Schools   Field Trips In-School Performances Classroom Guides Tours Allesee Resource Library Dance Dialogues MOT Learns at Home Support   Annual Fund & DiChiera Society Other Ways to Give Planned Giving David DiChiera Artistic Fund Sponsorship Opportunities Why I give to MOT About Us   Our History   MOT History DOH History Past Seasons David DiChiera Leadership   Board of Directors Wayne S. Brown Yuval Sharon Christine Goerke Admin & Staff   Our mission Antiracism Statement of Commitment Opera America Member Musicians   Orchestra Roster Chorus Roster Children’s Choruses Non-Profit Status Press

I am facing the same issue with several sites, so I think I am missing something here.

Thanks in advance

CodePudding user response:

from bs4 import BeautifulSoup
from selenium import webdriver
import urllib.parse
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service

service = Service("/home/ubuntu/selenium_drivers/chromedriver")

options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3")
options.add_argument("--headless")
options.add_argument('--ignore-certificate-errors')
options.add_argument("--enable-javascript")
options.add_argument('--incognito')

URL = "https://michiganopera.org/season-schedule/frida/"

try:
    driver = webdriver.Chrome(service = service, options = options)
    driver.get(URL)
    driver.implicitly_wait(2)
    html_content = driver.page_source
    driver.quit()
except WebDriverException:
    driver.quit()

soup = BeautifulSoup(html_content, 'html.parser')

# Remove the main navigation menu.
z = soup.find("nav", {"class": "nav-main"})
z.extract()

# Remove all header elements.
for h in soup.find_all('header'):
    try:
        h.extract()
    except:
        pass

# Remove all footer elements.
for f in soup.find_all('footer'):
    try:
        f.extract()
    except:
        pass

# Remove the modal dialog, if present.
try:
    cols = soup.find("div", {"class": "modal fade"})
    cols.extract()
except:
    pass

# Extract the remaining text, then cut everything from the leftover
# "Sponsors" heading onwards.
text = soup.getText(separator=u' ')
sep = 'Sponsors'
stripped = text.split(sep, 1)[0]
print(stripped)
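
Splitting on the literal 'Sponsors' marker works for this page but is brittle: if the wording changes, the cut silently breaks. A less fragile option, shown below as a sketch on toy markup with hypothetical class names (not verified against the live site), is to decompose every menu-like block, since a second copy of the menu may live outside nav.nav-main, or to read only the content container you actually want.

from bs4 import BeautifulSoup

# Toy markup standing in for driver.page_source. The same menu text lives both
# in nav.nav-main and in a second mobile-menu block, so removing only the first
# nav still leaves menu text behind. Class names here are illustrative.
html_content = """
<html><body>
  <nav class="nav-main"><a href="/season">Season &amp; Tickets</a></nav>
  <div class="mobile-menu"><a href="/season">Season &amp; Tickets</a></div>
  <main class="site-content"><p>Frida performance details.</p></main>
  <footer>Sponsors</footer>
</body></html>
"""

# Option 1: decompose every nav/header/footer plus any leftover menu blocks.
soup = BeautifulSoup(html_content, "html.parser")
for tag in soup.find_all(["nav", "header", "footer"]):
    tag.decompose()
for menu in soup.select("div.mobile-menu"):
    menu.decompose()
print(soup.get_text(separator=" ", strip=True))   # prints: Frida performance details.

# Option 2: skip the cleanup entirely and read only the container you want.
soup = BeautifulSoup(html_content, "html.parser")
main = soup.select_one("main.site-content")
print(main.get_text(separator=" ", strip=True))   # prints: Frida performance details.

Either way, the cleanup no longer depends on the exact wording of the leftover text.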