import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
url='https://www.go100.com.tw/exam_download_3.php'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
#print(soup)
all_as = soup.find_all('a')
for index, a_tag in enumerate(all_as):
    if 'pdf' in a_tag['href']:
        #print(a_tag['href'])
        urlretrieve(a_tag['href'], 'file_tmp.pdf')
        break
It shows a ValueError, and I can't find what's wrong. Here is the result:
CodePudding user response:
You've already done 90% of the work. You must use urljoin from urllib.parse:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url='https://www.go100.com.tw/exam_download_3.php'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
#print(soup)
all_as = soup.find_all('a')
for index, a_tag in enumerate(all_as):
    if 'pdf' in a_tag['href']:
        print(a_tag['href'])
        print(urljoin(url, a_tag['href']))
        response = requests.get(urljoin(url, a_tag['href']))
        open("file_tmp.pdf", "wb").write(response.content)
        break
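As a side note, this is why the original urlretrieve call raises the ValueError: a relative href has no scheme, so urllib can't open it, while urljoin resolves it against the page URL first. A quick illustration (the href below is a hypothetical example; the real ones come from the page):
from urllib.parse import urljoin
from urllib.request import urlretrieve
base = 'https://www.go100.com.tw/exam_download_3.php'
href = 'pdf/example.pdf'  # hypothetical relative link, like the ones on the page
# urlretrieve(href, 'file_tmp.pdf')  # would raise ValueError: unknown url type
print(urljoin(base, href))           # https://www.go100.com.tw/pdf/example.pdf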
CodePudding user response:
You can download all those files using requests only:
import requests
from bs4 import BeautifulSoup
import re
url='https://www.go100.com.tw/exam_download_3.php'
s = requests.Session()
correct_links = []
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
links = [a.get('href') for a in soup.select('a') if a.get('href') and '.pdf' in a.get('href')]
for link in links:
    if 'https://' not in link:
        link = 'https://www.go100.com.tw' + link
    correct_links.append(link)

for link in list(set(correct_links)):
    r = s.get(link)
    with open(f"{re.sub(r'[^a-zA-Z0-9]', '', link)}.pdf", "wb") as f:
        f.write(r.content)
    print(f"saved {re.sub(r'[^a-zA-Z0-9]', '', link)}")
This will save all the downloadable PDFs to the folder you run the script from, with file names derived from their URLs. Requests documentation: https://requests.readthedocs.io/en/latest/
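If you'd rather keep the original PDF file names instead of the stripped-down URL string, one small variation (just a sketch, assuming each link ends in the actual file name) is to take the last path segment:
import os
from urllib.parse import urlsplit
link = 'https://www.go100.com.tw/pdf/example.pdf'  # hypothetical link for illustration
filename = os.path.basename(urlsplit(link).path)   # 'example.pdf'
print(filename)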