I am trying to run the code below, which uses libraries like BeautifulSoup to download files, but something is missing on my end. Can anyone check whether I am doing this the right way? My goal is simply to download all PDF files from a website and save them to a specific directory.
import os
import requests
import ssl
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202"
headers = {
    'Host': 'sedar.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'TE': 'trailers'
}

# If there is no such folder, the script will create one automatically
folder_location = r'C:\Users\jay_patel1\...\test'
base_url = 'https://sedar.com'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url=url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
forms = soup.find_all('form')
print(len(forms))

counter = 0
for form in forms:
    action = form['action']
    doc_link = base_url + action
    filename = os.path.join(folder_location, str(counter) + '.pdf')
    counter = counter + 1
    headers['Cookie'] = '__uzma=28481475-7c5c-4e62-ad08-ebabddb1fff0; __uzmb=1657722275; __uzme=5741; __uzmc=7155215141911; __uzmd=1657726129; TS015c16dc=016abe8a18b65821e9668faf87606307d873066e84546d78152ebf0e1734d6f553b800143820e548c5ead2bd19b0cd0d8c6d8b4fc0937d7b83d0619de26e621bf2db8d6741; __ssds=2; __ssuzjsr2=a9be0cd8e; __uzmaj2=5c5b0a58-f490-41e6-945d-6a4766be7d82; __uzmbj2=1657722277; __uzmcj2=158447041868; __uzmdj2=1657726118; JSESSIONID=0000Hh7sa7ec87A8vlJsaqRiHhX:1884ter20'
    with open(filename, 'wb') as f:
        print("Writing")
        print(doc_link)
        content = requests.get(url=doc_link, headers=headers).content
        #print(content)
        f.write(content)
I am getting the error below. I have imported ssl, but the error still shows a failure:
Traceback (most recent call last):
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 699, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 382, in _make_request
    self._validate_conn(conn)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 1010, in _validate_conn
    conn.connect()
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connection.py", line 416, in connect
    self.sock = ssl_wrap_socket(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\util\ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\util\ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Users\jay_patel1\Anaconda3\lib\ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
  File "C:\Users\jay_patel1\Anaconda3\lib\ssl.py", line 1040, in _create
    self.do_handshake()
  File "C:\Users\jay_patel1\Anaconda3\lib\ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\adapters.py", line 439, in send
    resp = conn.urlopen(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 755, in urlopen
    retries = retries.increment(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\util\retry.py", line 574, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
MaxRetryError: HTTPSConnectionPool(host='sedar.com', port=443): Max retries exceeded with url: /DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\jay_patel1\Downloads\pdf download (1).py", line 36, in <module>
    response = requests.get(url=url, headers=headers)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\sessions.py", line 542, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\sessions.py", line 655, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\adapters.py", line 514, in send
    raise SSLError(e, request=request)
SSLError: HTTPSConnectionPool(host='sedar.com', port=443): Max retries exceeded with url: /DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
CodePudding user response:
Try this
requests.get('https://url.com', verify=False)
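Note that disabling verification makes requests emit an InsecureRequestWarning on every call. A minimal, self-contained sketch of this approach applied to the URL from the question, with the warning silenced (the urllib3.disable_warnings call is an addition, not part of the original script):

import urllib3
import requests

# verify=False skips certificate validation entirely; only do this for a host you trust.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = "https://sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202"
response = requests.get(url, verify=False)
print(response.status_code)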
CodePudding user response:
Your code is likely failing to complete the SSL handshake. Are you on a corporate network or behind a proxy?
Adding verify=False to requests.get will get you past the issue, but that call is then insecure:
folder_location = r'C:\Users\jay_patel1\...\test'
base_url = 'https://sedar.com'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url=url, headers=headers, verify=False)
Ideally, you'd download the SSL certificate from the website in question and pass its path to the verify parameter; more information here: https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification
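For example (a rough sketch; the PEM path below is hypothetical and stands in for the certificate chain you export from the site or get from your IT team):

import requests

# Hypothetical path to a PEM file containing the site's certificate chain
ca_bundle = r'C:\certs\sedar-chain.pem'

url = "https://sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202"
response = requests.get(url, verify=ca_bundle)

# Alternatively, point the REQUESTS_CA_BUNDLE environment variable at the same
# file so every requests call picks it up without passing verify explicitly.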