I have an Excel sheet with a bunch of unique URLs, each of which triggers the download of a file. I've automated the process using Python and pandas, but I'm struggling to find a solution for when either:
1. the URL path contains special characters, or 2. the filename contains special characters
# Load the workbook and keep its second column (positional index 1);
# getVideo() iterates over this column looking for download URLs.
df = pd.read_excel('MC-Redo.xlsx')
df_column = df.iloc[:, 1]
def getVideo():
    """Download every Pinnacle blob URL found in ``df_column``.

    The local file name is the text after the last ``/`` of the URL, and a
    URL is skipped when ``path + fileName`` already exists. Spaces in a URL
    are replaced with ``%20`` before the request is made; no other character
    is encoded, so a URL containing a non-ASCII character such as ``ø`` is
    handed to ``urlretrieve`` unchanged.
    """
    # NOTE(review): ``path`` is defined outside this snippet; presumably a
    # directory prefix that already ends with a separator -- confirm.
    for value in df_column:
        # URLs without spaces are requested exactly as they appear in the sheet.
        if "https://pinnacle.blob.core.windows.net/" in str(value) and " " not in str(value):
            if '/' in value:
                fileName = value.rsplit('/', 1)[1]
                if not os.path.exists(path + fileName):
                    urllib.request.urlretrieve(value, fileName)
        # URLs with spaces get the spaces percent-encoded first.
        if "https://pinnacle.blob.core.windows.net/" in str(value) and " " in str(value):
            newUrl = value.replace(' ', '%20')
            if '/' in newUrl:
                fileName = newUrl.rsplit('/', 1)[1]
                if not os.path.exists(path + fileName):
                    try:
                        urllib.request.urlretrieve(newUrl, fileName)
                    except urllib.error.HTTPError as e:
                        # Give up on this row and move on to the next URL.
                        if e.code != 200:
                            continue
I'm getting the following error on some of the rows I'm iterating over when trying to fetch the video from its URL:
Traceback (most recent call last):
File "c:\Users\aleb\Python EP download\getShortURLS.py", line 63, in <module>
getVideo()
File "c:\Users\aleb\Python EP download\getShortURLS.py", line 55, in getVideo
data = urllib.request.urlretrieve(newUrl, str(fileName))
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 239, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 517, in open
response = self._open(req, data)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1389, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1346, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1285, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1296, in _send_request
self.putrequest(method, url, **skips)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1134, in putrequest
self._output(self._encode_request(request))
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1214, in _encode_request
return request.encode('ascii')
UnicodeEncodeError: 'ascii' codec can't encode character '\xf8' in position 66: ordinal not in range(128)
I've tried using (with no luck):
urllib.request.urlretrieve(newUrl, fileName.encode('utf8'))
The special characters that may appear in the URL and/or the filename are 'æ', 'ø' and 'å'.
If I'm doing a print before the
urllib.request.urlretrieve(newUrl, fileName)
I'm seeing filename:
"01-Første_innlogging.mp4"
and URL: https://pinnacle.blob.core.windows.net/client-files/{key}/{key}/01-Første innlogging.mp4
CodePudding user response:
Try
import urllib.error
import urllib.parse
import urllib.request
# Read the spreadsheet; the second column (index 1) is the one that
# getVideo() scans for download URLs.
df = pd.read_excel('MC-Redo.xlsx')
df_column = df.iloc[:, 1]
def _encode_url(url):
    """Return *url* with its path percent-encoded so the request line is ASCII."""
    parts = urllib.parse.urlsplit(url)
    # "/" must stay as the segment separator, and "%" is left alone so that a
    # URL which already contains escapes such as %20 is not double-encoded.
    encoded_path = urllib.parse.quote(parts.path, safe="/%")
    return parts._replace(path=encoded_path).geturl()


def getVideo():
    """Download every Pinnacle blob URL found in ``df_column``.

    Each URL's path is percent-encoded before the request (spaces become
    ``%20``, ``ø`` becomes ``%C3%B8``), because the HTTP request line must be
    ASCII. The local file name is the last path segment, decoded back to
    readable text. A URL is skipped when ``path + fileName`` already exists,
    and an HTTP error on one URL is reported and does not stop the others.
    """
    # NOTE(review): ``path`` is defined outside this snippet; presumably a
    # directory prefix that already ends with a separator -- confirm. The file
    # itself is still written to ``fileName`` relative to the working directory.
    prefix = "https://pinnacle.blob.core.windows.net/"
    for value in df_column:
        url = str(value)
        if prefix not in url:
            continue

        # Take the name from the parsed path so a query string never ends up
        # in it, and unquote so pre-encoded URLs still give a readable name.
        last_segment = urllib.parse.urlsplit(url).path.rsplit('/', 1)[-1]
        fileName = urllib.parse.unquote(last_segment)
        if not fileName:
            # URL ends with "/": there is no file name to save under.
            continue
        if os.path.exists(path + fileName):
            continue

        try:
            urllib.request.urlretrieve(_encode_url(url), fileName)
        except urllib.error.HTTPError as err:
            print(f"Skipping {url}: HTTP {err.code} {err.reason}")
from this answer.
The reason is that urlretrieve needs an ASCII-only URL, so you need to percent-encode the ø
as well (not just the space). Once encoded, the link looks like:
https://pinnacle.blob.core.windows.net/client-files/{key}/{key}/01-F%C3%B8rste%20innlogging.mp4
Maybe you have to also encode the filename, not sure, can't test because I don't have the actual link.