I am trying to download Excel files from the website. My code below:
import os
import requests
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin, unquote
import urllib

# Browser-like User-Agent so the site serves the normal page to the scraper.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# NOTE: headers was previously built but never passed to requests.get.
resp = requests.get("https://www.elections.on.ca/en/resource-centre/elections-results.html#accordion2022ge", headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")
for link in soup.find_all('a', href=True):
    if 'xlsx' in link['href']:
        print(link['href'])
        # The hrefs are partially percent-encoded but can still contain raw
        # non-ASCII characters (e.g. the é in "Orléans"), which makes
        # urlretrieve fail with UnicodeEncodeError. Unquote first (so already
        # encoded spaces are not double-encoded), then quote the whole path so
        # every non-ASCII byte is percent-encoded too.
        path = urllib.parse.quote(urllib.parse.unquote(link['href']))
        url = "https://www.elections.on.ca" + path
        # Derive a readable local filename from the (decoded) last path segment.
        file = unquote(path).split("/")[-1].split(".")[0] + ".xlsx"
        urllib.request.urlretrieve(url, file)
However, I get the following error when https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx is trying to be opened
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-9-e1694f5ee458> in <module>
8 file= url.split("/")[-1].split(".")[0] + ".xlsx"
9 # print(file)
---> 10 urllib.request.urlretrieve(url, file)
...
UnicodeEncodeError: 'ascii' codec can't encode characters in position 101-102: ordinal not in range(128).
EDIT: I tried the safeStr
solution from UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128), but it does not work. Please see below:
def safeStr(obj):
    """Return str(obj) with every non-ASCII character dropped.

    Falls back to "" if the object cannot be stringified at all.
    Note: this *removes* accented characters (é -> nothing), which is why it
    produces a 404 here — the server's path really contains the accent.
    """
    try:
        return str(obj).encode('ascii', 'ignore').decode('ascii')
    except Exception:  # was a bare except; keep the best-effort "" fallback
        return ""
# Reconstructed snippet: the scrape stripped the literal '+' operators, which
# made these concatenations a SyntaxError. This attempt still 404s because
# safeStr *deletes* the accented character instead of percent-encoding it.
url = "https://www.elections.on.ca/" + '/content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx'
# print(url)
print(url)
file = url.split("/")[-1].split(".")[0] + ".xlsx"
url = safeStr(url)
print(url)
# print(file)
urllib.request.urlretrieve(url, file)
The error I get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orlans 076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-33-01070419a054> in <module>
6 print(url)
7 # print(file)
----> 8 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
I tried another solution from problem of urlretrieve cannot get image from url contains unicode string, but it also does not work:
# Reconstructed snippet: the scrape stripped the literal '+' operator.
# This attempt still 404s because quote() double-encodes the parts of the
# path that were already percent-encoded on the site.
url = "https://www.elections.on.ca/" + urllib.parse.quote('/content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx')
#url = safeStr(url)
print(url)
urllib.request.urlretrieve(url, file)
The error I get is:
https://www.elections.on.ca//content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orléans%20076.xlsx
HTTPError Traceback (most recent call last)
<ipython-input-56-cfce9d1344d0> in <module>
2 #url = safeStr(url)
3 print(url)
----> 4 urllib.request.urlretrieve(url, file)
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
CodePudding user response:
I think this is a solution...
The problem is that the url you start with:
"https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote Totals From Official Tabulation - Orléans 076.xlsx"
is already url-quoted (e.g. spaces are already encoded as %20), but still contains a non-ascii char: the é in Orléans.
So the solution from this question will help us, but just applying urllib.parse.quote(...)
results in twice-encoded spaces (each %20 becomes %2520). That is why you get a 404 when requesting the processed url.
So first we need to unquote the url (i.e. turn each %20 back into " "
), then quote it again - this time the accented char will be quoted too and it should work.
Try this:
# Unquote first (collapses any pre-encoded %20 back to spaces), then quote the
# whole path so spaces AND the accented é are each encoded exactly once.
path = urllib.parse.quote(urllib.parse.unquote(link['href']))
url = "https://www.elections.on.ca" + path
The result we get is:
https://www.elections.on.ca/content/dam/NGW/sitecontent/2022/results/Vote%20Totals%20From%20Official%20Tabulation%20-%20Orl%C3%A9ans%20076.xlsx
...should work now!