Unable to scrape texts from URLs


I have been struggling to scrape the contents/text of news articles from each URL. The extraction of the URLs works fine, but scraping the text from each one has been challenging. Below is my code:


from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys
from bs4 import BeautifulSoup
import requests
import pandas as pd


# Initialize the driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)

time.sleep(3)

# Grab the article elements (any article containing a non-empty heading)
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    "//article//*[(name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6') and string-length(text()) > 0]/ancestor::article")))

article_link = []
full_text = []
# For each article, grab its link
for article in articles:
    link = article.find_element(By.XPATH, ".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)


    for j in article_link:
        news_response = requests.get(j)
        news_data = news_response.content
        news_soup = BeautifulSoup(news_data, 'html.parser')
        art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
        full_text.append(art_cont.text)

print(article_link)
print(full_text)


I tried to use BeautifulSoup, but it doesn't seem to work. I would be grateful for any help.

CodePudding user response:

First off, you should unindent the second for loop; it shouldn't run inside the first loop, or you'll re-scrape the entire (still growing) list of links on every iteration and collect the same information many extra times.

Second, the requests you send return a page whose content is blocked (I couldn't find a way around this by adding headers to the request). What you can do instead is load each link with the driver and grab the text from there. Here is how:

for link in article_link:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
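
If you want to confirm that plain requests is what's failing, a quick look at the response is enough. A minimal check, assuming article_link has already been populated; the markers of a blocked page vary, so treat these prints as hints rather than a definitive test:

import requests

resp = requests.get(article_link[0], timeout=10)
print(resp.status_code)                    # Cloudflare challenges often return 403 or 503
print(len(resp.text))                      # a challenge page is usually far shorter than an article
print('cloudflare' in resp.text.lower())   # crude hint that a challenge page came back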

The full script would look like this:

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


# Initialize the driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)

time.sleep(3)

# Grab the article elements (any article containing a non-empty heading)
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    "//article//*[(name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6') and string-length(text()) > 0]/ancestor::article")))

article_link = []
full_text = []
# For each article, grab its link
for article in articles:
    link = article.find_element(By.XPATH, ".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)


for link in article_link[:5]:  # note: only the first 5 links are scraped here
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))

print(article_link)
print(full_text)
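
One caveat with the loop above: if any article page loads slowly or is missing the Article__StyledArticleContent container, the wait raises a TimeoutException and the whole run aborts. Here is a minimal, more forgiving sketch, assuming you would rather skip such pages than crash:

from selenium.common.exceptions import TimeoutException

for link in article_link[:5]:
    driver.get(link)
    try:
        news_data = wait.until(EC.presence_of_all_elements_located(
            (By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
        full_text.append(news_data[0].get_attribute('textContent'))
    except TimeoutException:
        # The article container never appeared; record a placeholder and move on.
        full_text.append(None)

driver.quit()  # close the browser once scraping is done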

CodePudding user response:

The best course of action is to use selenium throughout, since the site's content is protected by Cloudflare. Although @Andrew Ryan has already addressed the issue, I thought I'd offer a shorter version, since this answer was already halfway written when his was posted.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver

link = 'https://www.iol.co.za/news/south-africa/eastern-cape'

def get_links_and_texts(driver, url):
    driver.get(url)
    # Collect all article hrefs up front, before navigating away from the listing page.
    links = [i.get_attribute('href') for i in WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "//article/a[starts-with(@class,'Link__StyledLink')]")))]
    for article_link in links:
        driver.get(article_link)
        art_content = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".article-content"))).text
        yield {"Link": article_link, "article_content": art_content}

if __name__ == '__main__':
    with webdriver.Chrome() as driver:
        for item in get_links_and_texts(driver, link):
            print(item)
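
If you would rather end up with a DataFrame (the original script imports pandas anyway), the yielded dicts can be fed straight into the pandas constructor. A small sketch building on the same get_links_and_texts generator; the CSV filename is just an example:

import pandas as pd

if __name__ == '__main__':
    with webdriver.Chrome() as driver:
        df = pd.DataFrame(list(get_links_and_texts(driver, link)))
    print(df.head())
    df.to_csv('eastern_cape_articles.csv', index=False)  # example output file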