Asking the experts for help: running into some difficulties writing a web crawler in Python

Time: 09-19

I write my crawlers in Python. I recently found an example on the Internet and have modified it repeatedly, but I still can't get it to work. I'm just learning Python and would like to ask everyone what the problem is. The project consists of five files:
general.py
 
import os


# Each website is a separate project (folder)
def create_project_dir(directory):
    if not os.path.exists(directory):
        print('Creating directory ' + directory)
        os.makedirs(directory)


# Create queue and crawled files (if not created)
def create_data_files(project_name, base_url):
    queue = os.path.join(project_name, 'queue.txt')
    crawled = os.path.join(project_name, 'crawled.txt')
    if not os.path.isfile(queue):
        write_file(queue, base_url)
    if not os.path.isfile(crawled):
        write_file(crawled, '')


# Create a new file
def write_file(path, data):
    with open(path, 'w') as f:
        f.write(data)


# Add data onto an existing file
def append_to_file(path, data):
    with open(path, 'a') as file:
        file.write(data + '\n')


# Delete the contents of a file
def delete_file_contents(path):
    open(path, 'w').close()


# Read a file and convert each line to set items
def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results


# Iterate through a set, each item will be a line in a file
def set_to_file(links, file_name):
    with open(file_name, 'w') as f:
        for l in sorted(links):
            f.write(l + '\n')
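For anyone trying these helpers in isolation, a minimal usage sketch (assuming general.py is importable; the 'test-project' name and example URL are just placeholders) might look like this:

from general import *

create_project_dir('test-project')                          # creates the folder if missing
create_data_files('test-project', 'http://example.com/')    # seeds queue.txt with the base URL
append_to_file('test-project/crawled.txt', 'http://example.com/about')
print(file_to_set('test-project/queue.txt'))                # {'http://example.com/'}
set_to_file({'b', 'a'}, 'test-project/queue.txt')           # written sorted, one link per line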


domain.py
 
from urllib.parse import urlparse


# Get domain name (example.com)
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except:
        return ''


# Get sub domain name (name.example.com)
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''
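Assuming the reconstructed indexing above (results[-2] and results[-1]) is what was intended, the two helpers behave like this:

from domain import get_domain_name, get_sub_domain_name

print(get_sub_domain_name('http://viper-seo.com/'))      # viper-seo.com
print(get_domain_name('http://blog.example.com/page'))   # example.com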


link_finder.py
 
from html.parser import HTMLParser
from urllib import parse


class LinkFinder(HTMLParser):

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    # When we call HTMLParser.feed(), this function is called when it encounters an opening tag <a>
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (attribute, value) in attrs:
                if attribute == 'href':
                    url = parse.urljoin(self.base_url, value)
                    self.links.add(url)

    def page_links(self):
        return self.links

    def error(self, message):
        pass
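A quick self-contained check of LinkFinder (the HTML snippet is made up for illustration) would be:

from link_finder import LinkFinder

finder = LinkFinder('http://example.com/', 'http://example.com/page')
finder.feed('<html><body><a href="/about">About</a></body></html>')
print(finder.page_links())   # {'http://example.com/about'}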


main.py
 
import threading
from queue import Queue
from spiders import spiders
from domain import *
from general import *

PROJECT_NAME = '9-seo'
HOMEPAGE = 'http://viper-seo.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()
spiders(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        spiders.crawl_page(threading.current_thread().name, url)
        queue.task_done()


# Each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


# Check if there are items in the queue, if so crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_workers()
crawl()


spiders.py
 
from urllib.request import urlopen
from link_finder import LinkFinder
from domain import *
from general import *


class spiders:

    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        spiders.project_name = project_name
        spiders.base_url = base_url
        spiders.domain_name = domain_name
        spiders.queue_file = spiders.project_name + '/queue.txt'
        spiders.crawled_file = spiders.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First spider', spiders.base_url)

    # Creates the directory and files for the project on the first run and starts the spider
    @staticmethod
    def boot():
        create_project_dir(spiders.project_name)
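The listing is cut off at this point. main.py expects this class to finish boot() and to expose a crawl_page() method, so for reference here is a hedged sketch of how that part is typically completed; the helper names gather_links, add_links_to_queue and update_files are assumptions, not taken from the post:

        # assumed continuation of boot(): create the data files and load the sets
        create_data_files(spiders.project_name, spiders.base_url)
        spiders.queue = file_to_set(spiders.queue_file)
        spiders.crawled = file_to_set(spiders.crawled_file)

    # Crawl one page: gather its links, move it from the queue set to the crawled set
    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in spiders.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue ' + str(len(spiders.queue)) + ' | Crawled ' + str(len(spiders.crawled)))
            spiders.add_links_to_queue(spiders.gather_links(page_url))
            spiders.queue.discard(page_url)
            spiders.crawled.add(page_url)
            spiders.update_files()

    # Fetch a page, parse it with LinkFinder and return the links found on it
    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if 'text/html' in response.getheader('Content-Type'):
                html_string = response.read().decode('utf-8')
            finder = LinkFinder(spiders.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()

    # Add only new links that belong to the target domain
    @staticmethod
    def add_links_to_queue(links):
        for url in links:
            if (url in spiders.queue) or (url in spiders.crawled):
                continue
            if spiders.domain_name != get_domain_name(url):
                continue
            spiders.queue.add(url)

    # Persist the in-memory sets back to queue.txt and crawled.txt
    @staticmethod
    def update_files():
        set_to_file(spiders.queue, spiders.queue_file)
        set_to_file(spiders.crawled, spiders.crawled_file)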