general.py

import os


# Each website is a separate project (folder)
def create_project_dir(directory):
    if not os.path.exists(directory):
        print('Creating directory ' + directory)
        os.makedirs(directory)


# Create queue and crawled files (if not created)
def create_data_files(project_name, base_url):
    queue = os.path.join(project_name, 'queue.txt')
    crawled = os.path.join(project_name, 'crawled.txt')
    if not os.path.isfile(queue):
        write_file(queue, base_url)
    if not os.path.isfile(crawled):
        write_file(crawled, '')


# Create a new file
def write_file(path, data):
    with open(path, 'w') as f:
        f.write(data)


# Add data onto an existing file
def append_to_file(path, data):
    with open(path, 'a') as file:
        file.write(data + '\n')


# Delete the contents of a file
def delete_file_contents(path):
    open(path, 'w').close()


# Read a file and convert each line to set items
def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results


# Iterate through a set, each item will be a line in a file
def set_to_file(links, file_name):
    with open(file_name, 'w') as f:
        for l in sorted(links):
            f.write(l + '\n')
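A quick way to sanity-check these helpers is a short throwaway script; the project name and URLs below are made up for illustration:

# scratch test for general.py (hypothetical names and paths)
from general import create_project_dir, create_data_files, file_to_set, set_to_file

create_project_dir('demo-project')                        # creates the folder if it is missing
create_data_files('demo-project', 'http://example.com/')  # seeds queue.txt with the base URL

links = file_to_set('demo-project/queue.txt')             # {'http://example.com/'}
links.add('http://example.com/about')
set_to_file(links, 'demo-project/queue.txt')              # rewrites the file, one sorted link per line
print(file_to_set('demo-project/queue.txt'))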
domain.py

from urllib.parse import urlparse


# Get domain name (example.com)
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except:
        return ''


# Get sub domain name (name.example.com)
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''
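These two functions can be exercised directly; the URLs below are arbitrary examples:

from domain import get_domain_name, get_sub_domain_name

print(get_sub_domain_name('http://blog.example.com/post/1'))  # blog.example.com
print(get_domain_name('http://blog.example.com/post/1'))      # example.com
print(get_domain_name('no-dots-here'))                        # '' -- the bare except catches the IndexError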
Link_finder. Py
. The from HTML parser import HTMLParser
The from urllib import parse
The class LinkFinder (HTMLParser) :
Def __init__ (self, base_url, page_url) :
Super () __init__ ()
The self. The base_url=base_url
Self. Page_url=page_url
The self. The links=set ()
# the When we call HTMLParser feed () this function is called When it encounters an opening tag & lt; a>
Def handle_starttag (self, tag, attrs) :
If the tag=='a' :
For (attribute, value) in attrs:
If the attribute=='href' :
Url=the parse. Urljoin (self base_url, value)
The self. The links. The add (url)
Def page_links (self) :
Return the self. The links
Def error (self, the message) :
Pass
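A minimal sketch of LinkFinder on its own, feeding it a hand-written HTML snippet instead of a fetched page (the URLs are placeholders):

from link_finder import LinkFinder

finder = LinkFinder('http://example.com/', 'http://example.com/index.html')
finder.feed('<a href="/about">About</a> <a href="http://example.com/contact">Contact</a>')
print(finder.page_links())
# {'http://example.com/about', 'http://example.com/contact'}

Relative hrefs such as /about are resolved against base_url by parse.urljoin, which is why the crawler can follow site-internal links.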
main.py

import threading
from queue import Queue
from spiders import Spiders
from domain import *
from general import *

PROJECT_NAME = '9-seo'
HOMEPAGE = 'http://viper-seo.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()
Spiders(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spiders.crawl_page(threading.current_thread().name, url)
        queue.task_done()


# Each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


# Check if there are items in the queue, if so crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_workers()
crawl()
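The threading here is the standard worker-queue pattern from the queue module; stripped of the crawler specifics, it looks like the sketch below (the dummy print job stands in for crawl_page):

import threading
from queue import Queue

q = Queue()

def work():
    while True:
        item = q.get()          # blocks until a job is available
        print(threading.current_thread().name, 'processing', item)
        q.task_done()           # must be called once per get()

for _ in range(4):
    t = threading.Thread(target=work)
    t.daemon = True             # daemon threads die when the main thread exits
    t.start()

for job in ('a', 'b', 'c'):
    q.put(job)
q.join()                        # returns once every queued job is marked done

In main.py, the crawl() call at the end of create_jobs() re-reads queue.txt after the batch finishes, because crawling a page typically adds new links to the queue file.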
spiders.py

from urllib.request import urlopen
from link_finder import LinkFinder
from domain import *
from general import *


class Spiders:

    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spiders.project_name = project_name
        Spiders.base_url = base_url
        Spiders.domain_name = domain_name
        Spiders.queue_file = Spiders.project_name + '/queue.txt'
        Spiders.crawled_file = Spiders.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First spider', Spiders.base_url)
    # Creates the directory and files for the project on the first run and starts the spiders
    @staticmethod
    def boot():
        create_project_dir(Spiders.project_name)
        create_data_files(Spiders.project_name, Spiders.base_url)
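Note that every attribute of Spiders is set on the class itself rather than on an instance, so the single Spiders(...) call in main.py configures shared state that all worker threads then reach through class-level access (e.g. Spiders.crawl_page): only one crawl queue and one crawled set exist no matter how many threads run.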