general.py

import os


# Each website is a separate project (folder)
def create_project_dir(directory):
    if not os.path.exists(directory):
        print('Creating directory ' + directory)
        os.makedirs(directory)


# Create queue and crawled files (if not created)
def create_data_files(project_name, base_url):
    queue = os.path.join(project_name, 'queue.txt')
    crawled = os.path.join(project_name, 'crawled.txt')
    if not os.path.isfile(queue):
        write_file(queue, base_url)
    if not os.path.isfile(crawled):
        write_file(crawled, '')


# Create a new file
def write_file(path, data):
    with open(path, 'w') as f:
        f.write(data)


# Add data onto an existing file
def append_to_file(path, data):
    with open(path, 'a') as file:
        file.write(data + '\n')


# Delete the contents of a file
def delete_file_contents(path):
    open(path, 'w').close()


# Read a file and convert each line to set items
def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results


# Iterate through a set, each item will be a line in a file
def set_to_file(links, file_name):
    with open(file_name, 'w') as f:
        for l in sorted(links):
            f.write(l + '\n')
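A quick way to sanity-check these helpers is a short throwaway script; the project name and URLs below are made up for illustration:

# scratch test for general.py (hypothetical names and paths)
from general import create_project_dir, create_data_files, file_to_set, set_to_file

create_project_dir('demo-project')                        # creates the folder if it is missing
create_data_files('demo-project', 'http://example.com/')  # seeds queue.txt with the base URL

links = file_to_set('demo-project/queue.txt')             # {'http://example.com/'}
links.add('http://example.com/about')
set_to_file(links, 'demo-project/queue.txt')              # rewrites the file, one sorted link per line
print(file_to_set('demo-project/queue.txt'))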
domain.py

from urllib.parse import urlparse


# Get domain name (example.com)
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except:
        return ''


# Get sub domain name (name.example.com)
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''
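These two functions can be exercised directly; the URLs below are arbitrary examples:

from domain import get_domain_name, get_sub_domain_name

print(get_sub_domain_name('http://blog.example.com/post/1'))  # blog.example.com
print(get_domain_name('http://blog.example.com/post/1'))      # example.com
print(get_domain_name('no-dots-here'))                        # '' -- the bare except catches the IndexError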
Link_finder. Py
. The from HTML parser import HTMLParser
The from urllib import parse
The class LinkFinder (HTMLParser) :
Def __init__ (self, base_url, page_url) :
Super () __init__ ()
The self. The base_url=base_url
Self. Page_url=page_url
The self. The links=set ()
# the When we call HTMLParser feed () this function is called When it encounters an opening tag & lt; a>
Def handle_starttag (self, tag, attrs) :
If the tag=='a' :
For (attribute, value) in attrs:
If the attribute=='href' :
Url=the parse. Urljoin (self base_url, value)
The self. The links. The add (url)
Def page_links (self) :
Return the self. The links
Def error (self, the message) :
Pass
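A minimal sketch of LinkFinder on its own, feeding it a hand-written HTML snippet instead of a fetched page (the URLs are placeholders):

from link_finder import LinkFinder

finder = LinkFinder('http://example.com/', 'http://example.com/index.html')
finder.feed('<a href="/about">About</a> <a href="http://example.com/contact">Contact</a>')
print(finder.page_links())
# {'http://example.com/about', 'http://example.com/contact'}

Relative hrefs such as /about are resolved against base_url by parse.urljoin, which is why the crawler can follow site-internal links.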
main.py

import threading
from queue import Queue
from spiders import Spiders
from domain import *
from general import *

PROJECT_NAME = '9-seo'
HOMEPAGE = 'http://viper-seo.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()
Spiders(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spiders.crawl_page(threading.current_thread().name, url)
        queue.task_done()


# Each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


# Check if there are items in the queue, if so crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_workers()
crawl()
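The threading here is the standard worker-queue pattern from the queue module; stripped of the crawler specifics, it looks like the sketch below (the dummy print job stands in for crawl_page):

import threading
from queue import Queue

q = Queue()

def work():
    while True:
        item = q.get()          # blocks until a job is available
        print(threading.current_thread().name, 'processing', item)
        q.task_done()           # must be called once per get()

for _ in range(4):
    t = threading.Thread(target=work)
    t.daemon = True             # daemon threads die when the main thread exits
    t.start()

for job in ('a', 'b', 'c'):
    q.put(job)
q.join()                        # returns once every queued job is marked done

In main.py, the crawl() call at the end of create_jobs() re-reads queue.txt after the batch finishes, because crawling a page typically adds new links to the queue file.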
spiders.py

from urllib.request import urlopen
from link_finder import LinkFinder
from domain import *
from general import *


class Spiders:

    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spiders.project_name = project_name
        Spiders.base_url = base_url
        Spiders.domain_name = domain_name
        Spiders.queue_file = Spiders.project_name + '/queue.txt'
        Spiders.crawled_file = Spiders.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First spider', Spiders.base_url)
    # Creates the directory and files for the project on the first run and starts the spiders
    @staticmethod
    def boot():
        create_project_dir(Spiders.project_name)
        create_data_files(Spiders.project_name, Spiders.base_url)
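Note that every attribute of Spiders is set on the class itself rather than on an instance, so the single Spiders(...) call in main.py configures shared state that all worker threads then reach through class-level access (e.g. Spiders.crawl_page): only one crawl queue and one crawled set exist no matter how many threads run.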