I am new to web scraping, and I am trying to scrape the titles, dates, links, and contents of news articles on this website: https://www.iol.co.za/news/south-africa/eastern-cape.
The titles of the articles have different class names and heading (h) tags. I was able to scrape the dates, links, and titles using the h tag. However, when I tried to store them in a pandas dataframe, I received the following error: ValueError: All arrays must be of the same length.
I also wrote the code to get the content of each article using the links. I got an error as well. I would be thankful if I could be assisted.
I have tried different options to scrape the titles by creating a list of the different class names, but to no avail.
Please see my code below:
import re
import sys
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Accumulators for every page scraped; one entry per article in each list,
# so the three lists always stay the same length (this is what fixes the
# "ValueError: All arrays must be of the same length" when building the frame).
art_title = []  # titles of all news articles
art_date = []   # dates of all news articles
art_link = []   # links of all news articles

pagesToGet = ['south-africa/eastern-cape']

for page in pagesToGet:
    print('processing page : \n')
    # String concatenation needs the + operator; the original paste had
    # "'https://www.iol.co.za' str(...)", which is a SyntaxError.
    url = 'https://www.iol.co.za/news/' + page
    print(url)

    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    try:
        # Navigating can throw (timeouts, DNS, etc.), so guard it.
        driver.get(url)
    except Exception:
        error_type, error_obj, error_info = sys.exc_info()
        print('ERROR FOR LINK:', url)                       # link that caused the problem
        print(error_type, 'Line:', error_info.tb_lineno)    # error info and offending line
        driver.quit()
        continue  # abandon this page and move on
    time.sleep(3)  # allow 3 seconds for the web page to open

    # Scroll to the bottom and click "more news"; k <= 2 gives two passes.
    k = 1
    while k <= 2:
        scroll_pause_time = 1  # adjust for slower machines
        screen_height = driver.execute_script("return window.screen.height;")
        step = 1
        while True:
            # Scroll one screen height each time.
            driver.execute_script(
                "window.scrollTo(0, {h}*{i});".format(h=screen_height, i=step))
            step += 1  # the paste lost the "+=": with "i = 1" this loop never ends
            time.sleep(scroll_pause_time)
            # Scroll height can grow as content lazy-loads, so re-read it.
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # Stop once we have scrolled past the total document height.
            if screen_height * step > scroll_height:
                break
        driver.find_element(
            By.CSS_SELECTOR, '.Articles__MoreFromButton-sc-1mrfc98-0').click()
        k += 1  # same lost "+=" as above; "k = 1" made this loop infinite
        time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    news = soup.find_all('article', attrs={'class': 'sc-ifAKCX'})
    print(len(news))

    # Collect exactly one title, one date, and one link per article, appending
    # None when a field is missing so the lists never drift apart in length.
    for card in news:
        # Titles use varying h1-h6 tags, so match any heading; take the first.
        heading = card.find(re.compile('^h[1-6]'))
        art_title.append(heading.text if heading is not None else None)

        date_tag = card.find('p', attrs={'class': 'sc-cIShpX'})
        if date_tag is not None:
            # Strip the "author | " prefix and trim the remainder.
            art_date.append(date_tag.text.rsplit('|', 1)[1][10:].rsplit('<', 1)[0])
        else:
            art_date.append(None)

        anchor = card.find('a')
        # hrefs are site-relative, so prepend the domain.
        art_link.append('https://www.iol.co.za' + anchor.get('href')
                        if anchor is not None else None)

    df = pd.DataFrame({'Article_Title': art_title, 'Date': art_date, 'Source': art_link})

    # Fetch each article page and extract its text as the content column.
    contents = []
    for link in art_link:
        if link is None:
            contents.append(None)
            continue
        driver.get(link)
        time.sleep(1)
        article_soup = BeautifulSoup(driver.page_source, 'html.parser')
        body = article_soup.find('article')
        contents.append(body.get_text(' ', strip=True) if body is not None else None)
    df['Content'] = contents
    df.to_csv('data.csv')
    driver.quit()
CodePudding user response:
I think this is what you are looking for:
# Needed libs
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Initialize driver and navigate.
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# Take every <article> that contains a non-empty heading. Titles use varying
# h tags, so the XPath accepts any of them. ("h7" is not a real HTML tag and
# never matches — TODO confirm it can be dropped from the selector.)
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

# For every article, print the title, the author/date line, and the link.
for article in articles:
    header = article.find_element(By.XPATH, ".//*[name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7']")
    print(header.get_attribute('textContent'))
    # The author/date is the first <p> sibling right after the heading,
    # when present; find_elements avoids raising if it is missing.
    author_and_date = article.find_elements(By.XPATH, ".//*[name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7']/following-sibling::p[1]")
    if author_and_date:
        print(author_and_date[0].get_attribute('textContent'))
    else:
        print("No author found")
    link = article.find_element(By.XPATH, ".//a")
    print(link.get_attribute('href'))