I need to scrape Google News to get the links to articles from different newspapers, and I have code that works fine for today's news (from Google News). However, it doesn't work for older articles. For example, this code successfully gets article links from Google News:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import requests
import time
from newspaper import Article
import random
import pandas as pd

root = 'https://www.google.com/'
time.sleep(random.randint(0, 3))  #----------stop---------#
# Google News search for "revuelta la tercera" with no date filter
link = 'https://www.google.com/search?q=revuelta la tercera&rlz=1C1UEAD_esCL995CL995&biw=1536&bih=714&tbm=nws&ei=qEWUYorfOuiy5OUP-aGLgA4&ved=0ahUKEwiK07Wfr4b4AhVoGbkGHfnQAuAQ4dUDCA0&uact=5&oq=revuelta la tercera&gs_lcp=Cgxnd3Mtd2l6LW5ld3MQAzIFCCEQoAEyBQghEKABOgsIABCABBCxAxCDAToFCAAQgAQ6CAgAEIAEELEDOggIABCxAxCDAToKCAAQsQMQgwEQQzoECAAQQzoECAAQCjoGCAAQHhAWOggIABAeEA8QFlDIEliUnwFg1aABaAVwAHgAgAGSAYgBuw-SAQQyMS4ymAEAoAEBsAEAwAEB&sclient=gws-wiz-news'
time.sleep(random.randint(0, 6))  #----------stop---------#
req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
time.sleep(random.randint(0, 3))  #----------stop---------#
requests.get(link, headers={'User-agent': 'your bot 0.1'})
time.sleep(random.randint(0, 6))  #----------stop---------#
webpage = urlopen(req).read()
time.sleep(random.randint(0, 6))  #----------stop---------#
with requests.Session() as c:
    soup = BeautifulSoup(webpage, 'html5lib')
    # Every result card on the news results page uses this div class
    for item in soup.find_all('div', attrs={'class': 'ZINbbc luh4tb xpd O9g5cc uUPGi'}):
        raw_link = item.find('a', href=True)['href']
        link = raw_link.split('/url?q=')[1].split('&sa=U&')[0]
        # Download and parse each article with newspaper
        article = Article(link, language="es")
        article.download()
        article.parse()
        title = article.title
        descript = article.text
        date = article.publish_date
        print(title)
        print(descript)
        print(link)
Now I need to change the dates for the same search, so I just change the link to one with a custom date interval:
root = 'https://www.google.com/'
time.sleep(random.randint(0, 3))  #----------stop---------#
# Same search, but with a custom date range (tbs=cdr:1,cd_min:...,cd_max:...)
link = 'https://www.google.com/search?q=revuelta la tercera&rlz=1C1UEAD_esCL995CL995&biw=1536&bih=714&source=lnt&tbs=cdr:1,cd_min:1/1/2018,cd_max:1/6/2018&tbm=nws'
time.sleep(random.randint(0, 6))  #----------stop---------#
req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
time.sleep(random.randint(0, 3))  #----------stop---------#
requests.get(link, headers={'User-agent': 'your bot 0.1'})
time.sleep(random.randint(0, 6))  #----------stop---------#
webpage = urlopen(req).read()
time.sleep(random.randint(0, 6))  #----------stop---------#
with requests.Session() as c:
    soup = BeautifulSoup(webpage, 'html5lib')
    for item in soup.find_all('div', attrs={'class': 'ZINbbc luh4tb xpd O9g5cc uUPGi'}):
        raw_link = item.find('a', href=True)['href']
        link = raw_link.split('/url?q=')[1].split('&sa=U&')[0]
        article = Article(link, language="es")
        article.download()
        article.parse()
        title = article.title
        descript = article.text
        date = article.publish_date
        print(title)
        print(descript)
        print(link)
The results are supposed to be different (because of the different search dates), but both versions give me the same articles and I don't understand why. Please help guys, I have no idea how to fix this.
CodePudding user response:
Look carefully at the URL you provided. If you read it, the cd_min and cd_max parameters contain the date range, so let's slice that piece out of the URL:
cd_min%3A1%2F1%2F2018%2Ccd_max%3A1%2F6%2F2018
That string is sliced from the URL and it is URL-encoded. If you decode it you will see:
cd_min:1/1/2018,cd_max:1/6/2018
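You can check the decoding yourself with nothing but the standard library (a quick sketch, not specific to this problem):
from urllib import parse

encoded = "cd_min%3A1%2F1%2F2018%2Ccd_max%3A1%2F6%2F2018"
print(parse.unquote_plus(encoded))  # prints: cd_min:1/1/2018,cd_max:1/6/2018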
So if you want to change the dates for your query, you have to change that part of the URL:
from urllib import parse

# URL-encode the date boundaries
# (don't forget the ':' and the ',')
start_date = parse.quote_plus(":1/1/2018,")
end_date = parse.quote_plus(":1/6/2018")

# Create the query with the encoded dates spliced into the tbs parameter
link = f"https://www.google.com/search?q=revuelta la tercera&rlz=1C1UEAD_esCL995CL995&biw=1536&bih=714&source=lnt&tbs=cdr:1,cd_min{start_date}cd_max{end_date}&tbm=nws"
Writing the code to randomize the dates is your job :)
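In case it helps, here is a minimal sketch of how you could wrap that into a helper and plug in whichever dates you want. The function name date_range_link is just an illustration, and it assumes the cdr filter reads the dates in the same month/day/year format that appears in your URL, so double-check that the order is what you intend:
from urllib import parse
from datetime import date

def date_range_link(query, start, end):
    # Format the dates the way they appear in the tbs=cdr filter (M/D/YYYY),
    # then URL-encode them exactly like above
    cd_min = parse.quote_plus(f":{start.month}/{start.day}/{start.year},")
    cd_max = parse.quote_plus(f":{end.month}/{end.day}/{end.year}")
    q = parse.quote_plus(query)
    return (f"https://www.google.com/search?q={q}"
            f"&tbm=nws&source=lnt&tbs=cdr:1,cd_min{cd_min}cd_max{cd_max}")

# Same interval as in the question: 1/1/2018 to 1/6/2018
print(date_range_link("revuelta la tercera", date(2018, 1, 1), date(2018, 1, 6)))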