I'm trying to collect information from a website that I believe is protected by Cloudflare. I've tried three alternatives, and they all return empty values, so I don't know whether the site is blocking me or whether I'm doing something wrong.
--Update
The solution proposed by F.Hoque works; however, when I try to use it in Colab, I only get an empty value.
Using requests
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
# Article page whose headline should be extracted.
url = 'https://www.portaldoholanda.com.br/assaltante-surra/com-pedacos-de-madeira-populares-dao-surra-em-homem-assalt'
# Send a desktop-browser User-Agent rather than the library default.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# Fetch the page, then parse the raw response bytes with the built-in parser.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
# Look up the headline <h1> and read its text.
# (A lookup through soup.select was tried as well.)
headline_tag = soup.find('h1', class_="noticia titulo")
headline_tag.text
Using cloudscraper
import cloudscraper
# Create the scraper session before using it; without this line the call to
# scraper.get() below fails with NameError because `scraper` is undefined.
scraper = cloudscraper.create_scraper()
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# `url` and `BeautifulSoup` are the ones set up in the requests attempt above.
soup = BeautifulSoup(scraper.get(url, headers=headers).content, "html.parser")
# Read the text of the headline <h1>.
soup.find('h1',class_="noticia titulo").text
Using selenium
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import InvalidSessionIdException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Configure a headless Chrome session.
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('--ignore-ssl-errors')
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# The variable must be named `driver`: every later statement refers to it by
# that name, so the previous `river` spelling caused a NameError.
# NOTE(review): `executable_path` is rejected by recent Selenium 4 releases,
# which expect a Service object instead - confirm against the installed version.
driver = webdriver.Chrome(options=options, executable_path='/usr/bin/chromedriver')
try:
    print("Current session is {}".format(driver.session_id))
    # `url` is the article address defined in the requests attempt above.
    driver.get(url)
    # Name the parser explicitly, as the other attempts do, so the result does
    # not depend on which optional parsers happen to be installed.
    html = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always shut the browser down so no headless Chrome process is left behind.
    driver.quit()
# Read the text of the headline <h1> from the rendered page.
innerContent = html.find('h1',class_="noticia titulo").text
CodePudding user response:
Yes, the website is using Cloudflare protection.
https://www.portaldoholanda.com.br/assaltante-surra/com-pedacos-de-madeira-populares-dao-surra-em-homem-assalt is using Cloudflare CDN/Proxy!
https://www.portaldoholanda.com.br/assaltante-surra/com-pedacos-de-madeira-populares-dao-surra-em-homem-assalt is using Cloudflare SSL!
Here is a working solution using cloudscraper instead of requests.
Script:
import cloudscraper
from bs4 import BeautifulSoup
# Build the cloudscraper session with a 10-unit `delay` setting and a custom
# User-Agent string.
scraper = cloudscraper.create_scraper(
    delay=10,
    browser={'custom': 'ScraperBot/1.0'},
)
# Article page whose headline should be extracted.
url = "https://www.portaldoholanda.com.br/assaltante-surra/com-pedacos-de-madeira-populares-dao-surra-em-homem-assalt"
# Fetch the page through the scraper session and parse the raw bytes.
response = scraper.get(url)
soup = BeautifulSoup(response.content, "html.parser")
# Look up the headline <h1>, then print its text.
headline_tag = soup.find('h1', class_="noticia titulo")
txt = headline_tag.text
print(txt)
Output:
Com pedaços de madeira, populares dão surra em homem em Manaus; veja vídeo