is it possible to pass the √
through this untouched or am i asking too much
import urllib.request
path = 'html'
links = 'links'
with open(links, 'r', encoding='UTF-8') as links:
for link in links: #for each link in the file
print(link)
with urllib.request.urlopen(link) as linker: #get the html
print(linker)
with open(path, 'ab') as f: #append the html to html
f.write(linker.read())
links
https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A
output
File "PYdown.py", line 7, in <module>
with urllib.request.urlopen(link) as linker:
File "/usr/lib64/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib64/python3.6/urllib/request.py", line 526, in open
response = self._open(req, data)
File "/usr/lib64/python3.6/urllib/request.py", line 544, in _open
'_open', req)
File "/usr/lib64/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/usr/lib64/python3.6/urllib/request.py", line 1392, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/usr/lib64/python3.6/urllib/request.py", line 1349, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/usr/lib64/python3.6/http/client.py", line 1254, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib64/python3.6/http/client.py", line 1265, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib64/python3.6/http/client.py", line 1132, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\u221a' in position 29: ordinal not in range(128)
CodePudding user response:
instead of getting python to read √
as itself, I would have to translate the √
to √
in order to get python to output √
credit @Olvin Roght
CodePudding user response:
You need to quote Unicode chars in URL. You have file which contains list of urls you need to open, so you need to split each url (using urllib.parse.urlsplit()
), quote (with urllib.parse.quote()
) host and every part of path (to split paths you can use pathlib.PurePosixPath.parts
) and then form URL back (using urllib.parse.urlunsplit()
).
from pathlib import PurePosixPath
from urllib.parse import urlsplit, urlunsplit, quote, urlencode, parse_qsl
def normalize_url(url):
splitted = urlsplit(url) # split link
path = PurePosixPath(splitted.path) # initialize path
parts = iter(path.parts) # first element always "/"
quoted_path = PurePosixPath(next(parts)) # "/"
for part in parts:
quoted_path /= quote(part) # quote each part
return urlunsplit((
splitted.scheme,
splitted.netloc.encode("idna").decode(), # idna
str(quoted_path),
urlencode(parse_qsl(splitted.query)), # force encode query
splitted.fragment
))
Usage:
links = (
"https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A",
"https://stackoverflow.com/",
"https://www.google.com/search?q=√2&client=firefox-b-d",
"http://pfarmerü.com/"
)
print(*(normalize_url(link) for link in links), sep="\n")
Output:
https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A
https://stackoverflow.com/
https://www.google.com/search?q=√2&client=firefox-b-d,
http://xn--pfarmer-t2a.com/
You can help my country, check my profile info.