I'm trying to download the PDF of a research article from researchgate.net
with Python's requests library, using the following code:
import requests
import shutil
# Direct link to the PDF on ResearchGate.
dlink = "https://www.researchgate.net/profile/Corina-Florescu/publication/318596725_PositionRank_An_Unsupervised_Approach_to_Keyphrase_Extraction_from_Scholarly_Documents/links/5972182c0f7e9b40168fe63d/PositionRank-An-Unsupervised-Approach-to-Keyphrase-Extraction-from-Scholarly-Documents.pdf"

# Stream the body so the PDF is copied to disk in chunks instead of being held
# in memory. The ``with`` block closes the response when done, and the timeout
# keeps the script from hanging forever on an unresponsive server.
with requests.get(
    dlink,
    stream=True,
    timeout=30,
    headers={
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53',
        "sec-ch-ua-platform": '"Windows"',
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="101", "Microsoft Edge";v="101"',
        "sec-ch-ua-mobile": '?0',
        # The MIME type is "application/xhtml+xml"; the "+" had been lost.
        # This key also used to appear twice in the dict; one entry is enough.
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Dest": "document",
        # "br" is deliberately not advertised: brotli bodies can only be
        # decoded when an optional brotli package is installed, and an
        # undecoded body would be written to disk as a corrupt PDF.
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.9",
        "Cookie": "did=zp8vP15CGPkm8uJhry60BLeiaOSKZW8p51UupMljAdqP7WMnERbTEfnOEdkUTTE6; ptc=RG1.81179579133230362.1653300988; __cf_bm=VPc7zYTygZZxFutIAXPHQpRxUBRoKo5DuX4.nIiFqY4-1653300988-0-AS9iIVhVUqqi7Se51WNYTtVymQf9Bcoz4j93uLiq8GlgX73WKpoRJDlUdo0r3fsgmlKXLudkfYdv3NWQ3R10So0=; sid=JwZkeNfoeKESm08f4EtenQVKDk2nKtHGd7oj2rLJlkcwYIZBWff4LHaT6fuoKhSCkGzqyR0Hw5NoUrpTSJnIQsBdhyT7H4gjE0OLNSdOG1q6SSiq5rFPqb1UCjy8nmQS",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
    },
) as r:
    # Show the headers that were actually sent, for debugging.
    print(r.request.headers)
    print()
    if r.status_code == 200:
        with open("x.pdf", 'wb') as f:
            # Ask the raw stream to undo any Content-Encoding while reading,
            # so the bytes written to disk are the PDF itself.
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    else:
        print("Error downloading the file")
        print(r.status_code)
        print(r.reason)
The headers I'm using here were copied from the request headers section of the Edge browser in incognito mode.
Thanks for your attention.
CodePudding user response:
The Forbidden response was caused by a TLS verification failure. I had to mount a custom TLS adapter to make it work.
import ssl
import requests
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
from urllib3.util import ssl_
# Cipher suites handed to the SSL context, as a colon-separated OpenSSL
# cipher string.
CIPHERS = ":".join(
    (
        "ECDHE-RSA-AES256-GCM-SHA384",
        "ECDHE-ECDSA-AES256-GCM-SHA384",
        "ECDHE-RSA-AES256-SHA384",
        "ECDHE-ECDSA-AES256-SHA384",
        "ECDHE-RSA-AES128-GCM-SHA256",
        "ECDHE-RSA-AES128-SHA256",
        "AES256-SHA",
    )
)
class TLSAdapter(HTTPAdapter):
    """Transport adapter whose connection pools use a custom SSL context.

    The context restricts the offered ciphers to ``CIPHERS``, requires
    certificate verification, and applies the ``ssl.OP_*`` option flags
    supplied at construction time.
    """

    def __init__(self, ssl_options=0, *args, **kwargs):
        """Remember the SSL option flags, then initialise the base adapter.

        Args:
            ssl_options: Bitmask of ``ssl.OP_*`` flags passed to the SSL
                context as its ``options``.
            *args: Forwarded unchanged to ``HTTPAdapter.__init__``.
            **kwargs: Forwarded unchanged to ``HTTPAdapter.__init__``.
        """
        # Set before calling the base initialiser: init_poolmanager() reads
        # this attribute, and the base initialiser is expected to invoke it.
        self.ssl_options = ssl_options
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, *args, **kwargs):
        """Build the pool manager with the customised SSL context.

        Arguments are whatever ``HTTPAdapter`` supplies; they are passed
        through to the base implementation with ``ssl_context`` added.
        """
        context = ssl_.create_urllib3_context(
            ciphers=CIPHERS,
            cert_reqs=ssl.CERT_REQUIRED,
            options=self.ssl_options,
        )
        # Delegate to the base class instead of calling PoolManager(*args)
        # directly: the positional arguments are (connections, maxsize), and
        # PoolManager's second positional parameter is ``headers``, so the
        # pool size would be mis-bound and never applied.
        super().init_poolmanager(*args, ssl_context=context, **kwargs)
# Direct link to the PDF on ResearchGate.
url = "https://www.researchgate.net/profile/Corina-Florescu/publication/318596725_PositionRank_An_Unsupervised_Approach_to_Keyphrase_Extraction_from_Scholarly_Documents/links/5972182c0f7e9b40168fe63d/PositionRank-An-Unsupervised-Approach-to-Keyphrase-Extraction-from-Scholarly-Documents.pdf"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
}

# Disable TLS 1.0 and TLS 1.1 for connections made through this adapter.
adapter = TLSAdapter(ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)

with requests.Session() as session:
    # Only URLs under this prefix are routed through the custom adapter.
    session.mount("https://www.researchgate.net/", adapter)
    # The timeout keeps the script from blocking forever if the server
    # never answers.
    response = session.get(url, headers=headers, timeout=30)
    print(response.status_code)  # 200