Home > Software engineering >  How to scrape a URL on a page only once?
How to scrape a URL on a page only once?

Time:11-09

I'm not sure how I can count a URL on a page just once.

For example, this page https://www.ig.com/uk/news-and-trade-ideas/ includes the article https://www.ig.com/uk/news-and-trade-ideas/early-morning-call--221103 4 times in different sections.

How can I record it just once?

from cgitb import text
import requests
from bs4 import BeautifulSoup
import gspread
import datetime
import urllib
from urllib.parse import urlparse

# Connect to Google Sheet and select the first worksheet
gc = gspread.service_account(filename='creds.json')
sh = gc.open('scrapetosheets').sheet1

# Add URLs to inspect
urls = ["https://www.ig.com/uk/trading-strategies",
"https://www.ig.com/uk/news-and-trade-ideas",
"https://www.ig.com/us/trading-strategies",
"https://www.ig.com/us/news-and-trade-ideas",
"https://www.ig.com/en/trading-strategies",
"https://www.ig.com/en/news-and-trade-ideas",
"https://www.ig.com/za/trading-strategies",
"https://www.ig.com/za/news-and-trade-ideas",
"https://www.ig.com/au/trading-strategies",
"https://www.ig.com/au/news-and-trade-ideas",
"https://www.ig.com/fr/strategies-de-trading",
"https://www.ig.com/fr/marche-actualites-et-idees-de-trading",
"https://www.ig.com/de/trading-strategien",
"https://www.ig.com/de/nachrichten-und-trading-ideen",
"https://www.ig.com/it/strategie-di-trading",
"https://www.ig.com/it/news-e-idee-di-trading",
"https://www.ig.com/es/estrategias-de-trading",
"https://www.ig.com/es/ideas-de-trading-y-noticias",
"https://www.ig.com/en-ch/trading-strategies",
"https://www.ig.com/en-ch/news-and-trade-ideas",
"https://www.ig.com/cn/trading-strategies",
"https://www.ig.com/cn/news-and-trade-ideas",
"https://www.ig.com/se/tradingstrategier",
"https://www.ig.com/se/nyheter-och-trading-ideer",
"https://www.ig.com/nl/nieuws-en-trading-ideeen",
"https://www.ig.com/nl/trading-strategieen",
"https://www.ig.com/jp/trading-strategies",
"https://www.ig.com/jp/news-and-trade-ideas"]

# URLs already recorded in the sheet: column 3 of every existing row.
# New URLs found this run are added to the same set, so an article that
# appears several times on one page (or across pages) is recorded only once.
seen = {row[2] for row in sh.get_all_values()}
ar = []

for page_url in urls:
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")
    for item in soup.find_all("h3", class_="article-category-section-title"):
        link = item.find("a", class_="primary js_target")
        title = link.text.strip()
        rel = link.get("href")
        # Resolve the relative href against the site root, then drop the
        # scheme so it matches the form stored in the sheet.
        article_url = urllib.parse.urljoin("https://www.ig.com", rel).replace("https://", "")
        # The slug's last 6 characters encode the publish date as YYMMDD;
        # re-order them for display as "DD MM YY".
        pub = rel[-6:]
        datestring = f"{pub[4:6]} {pub[2:4]} {pub[0:2]}"
        if article_url not in seen:
            seen.add(article_url)  # dedupe within this run, not just vs the sheet
            ar.append([str(datetime.date.today()), title, article_url, datestring])

if ar:
    sh.append_rows(ar, value_input_option="USER_ENTERED")

CodePudding user response:

There are several approaches:

  • Append each URL to a list and, while iterating, check whether the current URL is already in that list; if it is, skip scraping it.

  • Or as mentioned use set and operate from the links:

    # Collect one dict per article link found under an <h3>.
    articles = []

    # NOTE(review): set() presumably dedupes the <a> tags by object identity,
    # not by href — two distinct tags pointing at the same URL would both
    # survive. Confirm, or dedupe by URL afterwards (see next bullet).
    for e in set(soup.select('h3>a')):
        # Climb to the enclosing <div> so sibling metadata (the date span)
        # is reachable from the same element.
        e = e.find_parent('div')
        articles.append({
            'url':e.a.get('href'),
            'title':e.get_text(strip=True),
            # Date is optional on some sections; fall back to None when absent.
            'date':e.select_one('.article-category-section-date').get_text(strip=True) if e.select_one('.article-category-section-date') else None
        })
    # Bare expression: displays the list in a REPL/notebook session.
    articles
    
  • Or collect your information in list of dicts and iterate over values to get unique one:

    list({v['url']:v for v in articles}.values())
    
  • ...

Example

import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.ig.com/uk/news-and-trade-ideas/')
# Name the parser explicitly: omitting it triggers bs4's
# GuessedAtParserWarning and can change results between machines.
soup = BeautifulSoup(r.content, 'html.parser')

articles = []

# Every <h3> that directly contains an <a> is an article heading;
# duplicates are intentionally kept here to show the before/after counts.
for e in soup.select('h3:has(>a)'):
    articles.append({
        'url': e.a.get('href'),
        'title': e.get_text(strip=True)
    })

# Dedupe by URL: a dict keyed on 'url' keeps one entry per distinct link
# (later duplicates overwrite earlier ones).
unique = list({v['url']: v for v in articles}.values())

print('With duplicates: ', len(articles))
print('Without duplicates: ', len(unique))

# Bare expression: displays the deduplicated list in a REPL/notebook.
unique

Output

With duplicates:  36
Without duplicates:  29
[{'url': '/uk/news-and-trade-ideas/_brent-crude-oil--gold-and-us-natural-gas-rallies-pause-amid-us--221108',
  'title': '\u200bBrent crude oil, gold and US natural gas ral...'},
 {'url': '/uk/news-and-trade-ideas/early-morning-call--gloomy-festive-season-ahead-amid-consumer-we-221108',
  'title': 'Early Morning Call: dollar basket steady ahead of ...'},
 {'url': '/uk/news-and-trade-ideas/nasdaq-listed-ryanair-posts-record-h1-results-221107',
  'title': 'Ryanair shares up after record H1 result...'},...]
  • Related