I'm trying to download the links on the following page: https://bdif.amf-france.org/fr?typesInformation=DD
More precisely, the "télécharger" buttons. These links are updated daily and I want to take only the new ones that have been updated. I've been trying via BeautifulSoup but it doesn't seem to work. The first step for me would be finding the list of links available (so that I can determine which ones are new). I've tried using the following method but it doesn't seem to give me any links. Does anyone know?
from bs4 import BeautifulSoup
import urllib.request

# Parser backend handed to BeautifulSoup.
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed

# Fetch the page; the context manager closes the HTTP response once the
# document has been parsed (BeautifulSoup reads it inside the constructor).
with urllib.request.urlopen("https://bdif.amf-france.org") as resp:
    soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))

# Print the target of every anchor tag that carries an href attribute.
for link in soup.find_all('a', href=True):
    print(link['href'])
Let me know if you have any questions. Thanks !
CodePudding user response:
Here's how to download the first 20 PDFs.
import requests
from pathlib import Path
from shutil import copyfileobj

# JSON listing endpoint; the `size` query parameter sets how many entries
# are requested in one call.
endpoint = "https://bdif.amf-france.org/back/api/v1/informations?from=0&size=20"
# Prefix that each document's `path` value is appended to.
base_api_url = "https://bdif.amf-france.org/back/api/v1/documents"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:97.0) Gecko/20100101 Firefox/97.0",
}

with requests.Session() as s:
    listing = s.get(endpoint, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    listing.raise_for_status()
    response = listing.json()

    file_sources = [
        [
            f"{base_api_url}/{item['_source']['documents'][0]['path']}",  # Document
            item["_source"]["documents"][0]["nomFichier"],  # File name
        ]
        for item in response["hits"]["hits"]
        if item["_source"].get("documents")  # skip entries with no attached document
    ]

    for url, name in file_sources:
        with s.get(url, stream=True, timeout=30) as pdf:
            # Check the status before creating the file so an error response
            # is never saved under a .pdf name.
            pdf.raise_for_status()
            # The raw urllib3 stream only undoes gzip/deflate transfer
            # encoding when explicitly asked to.
            pdf.raw.decode_content = True
            # Keep only the final path component: the name comes from the
            # server and must not be able to point outside the working dir.
            with open(Path(name).name, "wb") as output:
                copyfileobj(pdf.raw, output)
Output (20 pdf files in the directory you run the script from):
DD_22_825379_8943674.pdf
DD_22_825400_8943881.pdf
DD_22_825401_8943893.pdf
DD_22_825402_8943905.pdf
DD_22_825403_8943917.pdf
DD_22_825404_8943929.pdf
DD_22_825405_8943941.pdf
DD_22_825406_8943953.pdf
DD_22_825407_8943965.pdf
DD_22_825459_8944700.pdf
DD_22_825460_8944725.pdf
DD_22_825462_8944752.pdf
DD_22_825463_8944773.pdf
DD_22_825465_8944800.pdf
DD_22_825483_8944932.pdf
DD_22_825534_8945530.pdf
DD_22_825571_8945914.pdf
DD_22_825573_8945934.pdf
DD_22_825574_8945948.pdf
DD_22_825577_8945972.pdf
To get more, just change the `size` value in the endpoint. You can also filter the PDFs by date using the `dateCreation` key that sits in `_source`.
CodePudding user response:
To just get the links I did it with Selenium:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options as ChromeOptions
import chromedriver_autoinstaller
from selenium.webdriver.common.by import By
import pprint

# Only hrefs containing this URL fragment are kept.
DETAILS_URL = "https://bdif.amf-france.org/fr/details"

chromedriver_autoinstaller.install()
options = ChromeOptions()
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://bdif.amf-france.org/fr")
    driver.implicitly_wait(0.5)
    # find_elements_by_tag_name was removed in Selenium 4.3; the By-based
    # locator is the supported replacement.
    lnks = driver.find_elements(By.TAG_NAME, "a")
    # Read each href once; get_attribute may return None for anchors
    # without an href, hence the isinstance guard.
    hrefs = [lnk.get_attribute("href") for lnk in lnks]
    list_of_links = [
        href for href in hrefs
        if isinstance(href, str) and DETAILS_URL in href
    ]
finally:
    # Always shut the browser down, even if scraping raised.
    driver.quit()

pprint.pprint(list_of_links)
Then the output looks like this:
['https://bdif.amf-france.org/fr/details/2022DD825577',
'https://bdif.amf-france.org/fr/details/2022DD825574',
'https://bdif.amf-france.org/fr/details/2022DD825573',
'https://bdif.amf-france.org/fr/details/2022DD825571',
'https://bdif.amf-france.org/fr/details/2022DD825534',
'https://bdif.amf-france.org/fr/details/2022DD825483',
'https://bdif.amf-france.org/fr/details/2022DD825465',
'https://bdif.amf-france.org/fr/details/2022DD825463',
'https://bdif.amf-france.org/fr/details/2022DD825462',
'https://bdif.amf-france.org/fr/details/2022DD825460',
'https://bdif.amf-france.org/fr/details/2022DD825459',
'https://bdif.amf-france.org/fr/details/2022DD825407',
'https://bdif.amf-france.org/fr/details/2022DD825406',
'https://bdif.amf-france.org/fr/details/2022DD825405',
'https://bdif.amf-france.org/fr/details/2022DD825404',
'https://bdif.amf-france.org/fr/details/2022DD825403',
'https://bdif.amf-france.org/fr/details/2022DD825402',
'https://bdif.amf-france.org/fr/details/2022DD825401',
'https://bdif.amf-france.org/fr/details/2022DD825400',
'https://bdif.amf-france.org/fr/details/2022DD825379']