I am trying to run the code below, which uses libraries like BeautifulSoup to download files, but something is missing on my end. Can anyone check whether I am doing this the right way? My goal is simply to download all PDF files from a website and save them to a specific directory.
import os
import requests
import ssl
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202"
headers = {
    'Host': 'sedar.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'TE': 'trailers'
}

# If there is no such folder, the script will create one automatically
folder_location = r'C:\Users\jay_patel1\...\test'
base_url = 'https://sedar.com'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url=url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
forms = soup.find_all('form')
print(len(forms))

counter = 0
for form in forms:
    action = form['action']
    doc_link = base_url + action
    filename = os.path.join(folder_location, str(counter) + '.pdf')
    counter = counter + 1
    headers['Cookie'] = '__uzma=28481475-7c5c-4e62-ad08-ebabddb1fff0; __uzmb=1657722275; __uzme=5741; __uzmc=7155215141911; __uzmd=1657726129; TS015c16dc=016abe8a18b65821e9668faf87606307d873066e84546d78152ebf0e1734d6f553b800143820e548c5ead2bd19b0cd0d8c6d8b4fc0937d7b83d0619de26e621bf2db8d6741; __ssds=2; __ssuzjsr2=a9be0cd8e; __uzmaj2=5c5b0a58-f490-41e6-945d-6a4766be7d82; __uzmbj2=1657722277; __uzmcj2=158447041868; __uzmdj2=1657726118; JSESSIONID=0000Hh7sa7ec87A8vlJsaqRiHhX:1884ter20'
    with open(filename, 'wb') as f:
        print("Writing")
        print(doc_link)
        content = requests.get(url=doc_link, headers=headers).content
        #print(content)
        f.write(content)
I am getting the error below. I have imported ssl, but the error still shows a failure:
Traceback (most recent call last):
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 699, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 382, in _make_request
    self._validate_conn(conn)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 1010, in _validate_conn
    conn.connect()
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connection.py", line 416, in connect
    self.sock = ssl_wrap_socket(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\util\ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\util\ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Users\jay_patel1\Anaconda3\lib\ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
  File "C:\Users\jay_patel1\Anaconda3\lib\ssl.py", line 1040, in _create
    self.do_handshake()
  File "C:\Users\jay_patel1\Anaconda3\lib\ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\adapters.py", line 439, in send
    resp = conn.urlopen(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 755, in urlopen
    retries = retries.increment(
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\urllib3\util\retry.py", line 574, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
MaxRetryError: HTTPSConnectionPool(host='sedar.com', port=443): Max retries exceeded with url: /DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\jay_patel1\Downloads\pdf download (1).py", line 36, in <module>
    response = requests.get(url=url, headers=headers)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\sessions.py", line 542, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\sessions.py", line 655, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\jay_patel1\Anaconda3\lib\site-packages\requests\adapters.py", line 514, in send
    raise SSLError(e, request=request)
SSLError: HTTPSConnectionPool(host='sedar.com', port=443): Max retries exceeded with url: /DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
CodePudding user response:
Try this
requests.get('https://url.com', verify=False)
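Note that disabling verification makes requests emit an InsecureRequestWarning on every call. A minimal, self-contained sketch of this approach applied to the URL from the question, with the warning silenced (the urllib3.disable_warnings call is an addition, not part of the original script):

import urllib3
import requests

# verify=False skips certificate validation entirely; only do this for a host you trust.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = "https://sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202"
response = requests.get(url, verify=False)
print(response.status_code)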
CodePudding user response:
Your code is likely failing to complete the SSL handshake. Are you on a corporate network or behind a proxy?
Adding verify=False to requests.get will get you past the issue, but that call is then insecure:
folder_location = r'C:\Users\jay_patel1\...\test'
base_url = 'https://sedar.com'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url=url, headers=headers, verify=False)
Ideally, you'd download the SSL certificate from the website in question and pass its path to the verify parameter; more information here: https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification
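For example (a rough sketch; the PEM path below is hypothetical and stands in for the certificate chain you export from the site or get from your IT team):

import requests

# Hypothetical path to a PEM file containing the site's certificate chain
ca_bundle = r'C:\certs\sedar-chain.pem'

url = "https://sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202"
response = requests.get(url, verify=ca_bundle)

# Alternatively, point the REQUESTS_CA_BUNDLE environment variable at the same
# file so every requests call picks it up without passing verify explicitly.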