I am using the following script to extract internal and external links from web pages. I then another script to process those links according to my needs.
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW
# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()
total_urls_visited = 0
def is_valid(url):
"""
Checks whether `url` is a valid URL.
"""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
def get_all_website_links(url):
"""
Returns all URLs that is found on `url` in which it belongs to the same website
"""
# all URLs of `url`
urls = set()
soup = BeautifulSoup(requests.get(url).content, "html.parser")
for a_tag in soup.findAll("a"):
href = a_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme "://" parsed_href.netloc parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f"{GRAY}[!] External link: {href}{RESET}")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET}")
urls.add(href)
internal_urls.add(href)
return urls
def crawl(url, max_urls=1000):
"""
Crawls a web page and extracts all links.
You'll find all links in `external_urls` and `internal_urls` global set variables.
params:
max_urls (int): number of max urls to crawl, default is 30.
"""
global total_urls_visited
total_urls_visited = 1
print(f"{YELLOW}[*] Crawling: {url}{RESET}")
links = get_all_website_links(url)
for link in links:
if total_urls_visited > max_urls:
break
crawl(link, max_urls=max_urls)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
parser.add_argument("url", help="The URL to extract links from.")
parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
args = parser.parse_args()
url = args.url
max_urls = args.max_urls
# domain name of the URL without the protocol
domain_name = urlparse(url).netloc
crawl(url, max_urls=max_urls)
# save the internal links to a file
with open(f"{domain_name}_internal_links.txt", "w") as f:
for internal_link in internal_urls:
print(internal_link.strip(), file=f)
# save the external links to a file
with open(f"{domain_name}_external_links.txt", "w") as f:
for external_link in external_urls:
print(external_link.strip(), file=f)
I have tested the script on numerous websites and it works fine. But after some time I get an error (many different errors) and the script stops.
For example:
requests.exceptions.InvalidSchema: No connection adapters were found for
Is there an error-agnostic way to keep the script running?
CodePudding user response:
requests.exceptions.InvalidSchema: No connection adapters were found for
Based on error info, you need to include the protocol schema, cause without the http://
or https://
part, requests is not able to connect to the remote server.
Try to print your requestet url in case of an error, to get specific information.
In newer code avoid old syntax findAll()
instead use find_all()
or select()
with css selectors
- For more take a minute to check docs