I am trying to write a program that gets all the links on a webpage, even from sub-directories. I have this working with the requests package, but it is slow when it has to get links from a lot of sub-directories. Here is my working code, which takes about 4 minutes to gather all the links from https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/.
import requests
import re
from bs4 import BeautifulSoup

def get_html(base_url):
    req = requests.get(base_url)
    return req.text if (req.status_code == 200) else ''

def get_links(html_page):
    soup = BeautifulSoup(html_page, "lxml")  # removed "html.parser"
    regex = r'(.nc$)|(/$)'  # match .nc files and sub-directories
    links = [f"{base_url}{link.get('href')}" for link in soup.findAll('a', attrs={'href': re.compile(regex)})]
    return links

def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r'/$', link)]
    return sub_dirs

def get_files(links):
    file_links = [link for link in links if re.search(r'.nc$', link)]
    return file_links

def main(base_url):
    files = []
    html_page = get_html(base_url)
    links = get_links(html_page)
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.append(base_files)
    for sub in sub_dirs:
        sub_files = main(sub)
        files.append(sub_files)
    return files
# Run program
base_url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
files = main(base_url)
I think the bottleneck in the code is the get_html() function; it takes a few seconds to get the HTML back. I think this code can be optimized using async functions, but I am struggling to make this work. Here is my attempt at an async version of the code:
import aiohttp
import asyncio
import re
from bs4 import BeautifulSoup

async def get_html_async(base_url):
    async with aiohttp.ClientSession() as client:
        async with client.get(base_url) as resp:
            return await resp.text() if (resp.status == 200) else ''

def get_links(html_page):
    soup = BeautifulSoup(html_page, "lxml")  # removed "html.parser"
    regex = r'(.nc$)|(/$)'
    links = [f"{base_url}{link.get('href')}" for link in soup.findAll('a', attrs={'href': re.compile(regex)})]
    return links

def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r'/$', link)]
    return sub_dirs

def get_files(links):
    file_links = [link for link in links if re.search(r'.nc$', link)]
    return file_links

async def get_tasks(session):
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as resp:
            return await resp.text() if (resp.status == 200) else ''

async def main(base_url):
    files = []
    html_page = await asyncio.gather(get_html_async(base_url))
    links = get_links(html_page[0])
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.append(base_files)
    for sub in sub_dirs:
        sub_files = await asyncio.gather(main(sub))
        files.append(sub_files)
    return files

# Run program
base_url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
files = asyncio.gather(main(base_url))
Any help would be greatly appreciated. Thanks!
CodePudding user response:
By calling asyncio.gather() the way you do, you are running your requests sequentially, just as before. asyncio.gather() takes multiple awaitables as arguments and runs them concurrently. There is no point in calling asyncio.gather() with just one awaitable, since then you could simply await it. By creating all the coroutines in main() without awaiting them and then passing them all to asyncio.gather(), you get a significant speed-up.
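To see the difference in isolation, here is a minimal sketch; the fetch() coroutine and the two helper functions are placeholders of my own, not taken from your code:

import asyncio
import aiohttp

async def fetch(url):
    # placeholder coroutine: one HTTP GET per call
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as resp:
            return await resp.text()

async def sequential(urls):
    # awaiting (or gathering) one coroutine at a time runs the requests back to back
    return [await fetch(url) for url in urls]

async def concurrent(urls):
    # create all coroutines first, then pass them together to gather() so they run concurrently
    return await asyncio.gather(*(fetch(url) for url in urls))

Applied to your crawler, the fixed version looks like this: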
# some minor fixes added
import asyncio
import re
from itertools import chain

import aiohttp
from bs4 import BeautifulSoup

async def get_html_async(base_url):
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(ssl=False)  # I got ssl errors on my machine
    ) as client:
        async with client.get(base_url) as resp:
            return await resp.text() if (resp.status == 200) else ""

def get_links(html_page):
    soup = BeautifulSoup(html_page, "lxml")  # removed "html.parser"
    regex = r"(.nc$)|(/$)"
    links = [
        f"{base_url}{link.get('href')}"
        for link in soup.findAll("a", attrs={"href": re.compile(regex)})
    ]
    return links

def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r"/$", link)]
    return sub_dirs

def get_files(links):
    file_links = [link for link in links if re.search(r".nc$", link)]
    return file_links

async def main(base_url):
    files = []
    html_page = await get_html_async(base_url)
    links = get_links(html_page)  # removed indexing 'html_page[0]'
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.extend(base_files)  # extend list to get "cleaner" output, keep using 'append' if your downstream code requires it
    coros = [main(sub) for sub in sub_dirs]  # create all requests
    new_files = await asyncio.gather(*coros)  # run all requests concurrently
    files.extend(chain(*new_files))  # again, add to list as needed
    return files

# Run program
base_url = "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/"
files = asyncio.run(main(base_url))  # or simply 'await main(base_url)' in IPython
print(files)
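One further tweak you could experiment with (my own suggestion, not required for the speed-up above): the code opens a new ClientSession for every request, while the aiohttp docs recommend reusing a single session across many requests. A rough sketch of that, reusing get_links(), get_sub_dirs(), get_files() and chain from the code above, with a hypothetical crawl() helper:

async def get_html_async(client, url):
    # reuse the already-open session instead of creating one per request
    async with client.get(url) as resp:
        return await resp.text() if (resp.status == 200) else ""

async def crawl(client, url):
    links = get_links(await get_html_async(client, url))
    files = get_files(links)
    # recurse into sub-directories, still concurrently via gather()
    sub_results = await asyncio.gather(*(crawl(client, sub) for sub in get_sub_dirs(links)))
    files.extend(chain(*sub_results))
    return files

async def main(base_url):
    # one session for the whole crawl
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(ssl=False)
    ) as client:
        return await crawl(client, base_url)

files = asyncio.run(main(base_url))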