I am trying to download Excel files from the website. My code below:
import os
import requests
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin
import urllib
headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get("https://www.elections.on.ca/en/resource-centre/elections-results.html#accordion2022ge")
soup = BeautifulSoup(resp.text,"html.parser")
for link in soup.find_all('a', href=True):
# print(link)
if 'xlsx' in link['href']:
print(link['href'])
url="https://www.elections.on.ca/" link['href']
# print(url)
file= url.split("/")[-1].split(".")[0] ".xlsx"
# print(file)
urllib.request.urlretrieve(url, file)
However, I get the following error when https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx is trying to be opened
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-9-e1694f5ee458> in <module>
8 file= url.split("/")[-1].split(".")[0] ".xlsx"
9 # print(file)
---> 10 urllib.request.urlretrieve(url, file)
...
UnicodeEncodeError: 'ascii' codec can't encode characters in position 101-102: ordinal not in range(128).
EDIT: I tried the safeStr
solution form UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128), but it does not work. Please see below:
def safeStr(obj):
try: return str(obj).encode('ascii', 'ignore').decode('ascii')
except: return ""
url="https://www.elections.on.ca/" '/content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx'
# print(url)
print(url)
file= url.split("/")[-1].split(".")[0] ".xlsx"
url = safeStr(url)
print(url)
# print(file)
urllib.request.urlretrieve(url, file)
The error I get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orlans 076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-33-01070419a054> in <module>
6 print(url)
7 # print(file)
----> 8 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
I tried another solution from problem of urlretrieve cannot get image from url contains unicode string, but it also does not work:
url = "https://www.elections.on.ca/" urllib.parse.quote('/content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx')
#url = safeStr(url)
print(url)
urllib.request.urlretrieve(url, file)
The error I get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-56-cfce9d1344d0> in <module>
2 #url = safeStr(url)
3 print(url)
----> 4 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
CodePudding user response:
I think this is a solution...
The problem is that the url you start with:
"https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx'
is already url-quoted (e.g. spaces replaced by
), but still contains non-ascii chars here Orléans
So the solution from this question will help us, but just applying urllib.parse.quote(...)
results in twice-encoded spaces as %20
. That is why you get a 404 when requesting the processed url.
So first we need to unquote the url (i.e. ->> " "
), then quote it again - this time the accented char will be quoted too and it should work.
Try this:
path = urllib.parse.quote(urllib.parse.unquote(link['href']))
url = "https://www.elections.on.ca" path
The result we get is:
https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx
...should work now!