from urllib3 import *
from re import *
http = PoolManager()
disable_warnings()
# download an HTML file
def download(url):
    result = http.request('GET', url)
    # decode the downloaded HTML code into a string using utf-8
    htmlStr = result.data.decode('utf-8')
    return htmlStr
# analyse the HTML code
def analyse(htmlStr):
    # use a regular expression to collect all the <a> tags, such as
    # <a href="https://bbs.csdn.net/topics/a.html">First page</a>
    aList = findall('<a[^>]*>', htmlStr)
    result = []
    # iterate over the list of <a> tags
    for a in aList:
        # use a regular expression to extract the value of the href attribute from the <a> tag
        g = search(r'href\s*=\s*[\'"]([^\'">]*)[\'"]', a)
        if g != None:
            # the value of the href attribute is the value of the first group
            url = g.group(1)
            # turn the url into an absolute link
            url = 'http://localhost:8888/files/' + url
            # append the extracted url to the result list
            result.append(url)
    return result
# entry-point function used to crawl the HTML files
def crawler(url):
    # print the url that is being crawled
    print(url)
    # download the HTML file
    html = download(url)
    # analyse the HTML code
    urls = analyse(html)
    # call the crawler function recursively for each url
    for url in urls:
        crawler(url)

# start crawling all the HTML files from the entry-point url
crawler('http://localhost:8888/files')
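The crawler assumes a local web server is already serving the test pages at http://localhost:8888/files. If you do not have one, a minimal sketch such as the following (an assumption, not part of the original example) serves the current directory, which is assumed to contain a files folder with the test pages, using Python's built-in http.server module:

# minimal sketch, not part of the original example: serve the current
# directory (assumed to contain the "files" folder with the test pages)
# at http://localhost:8888, so the crawler's entry point resolves
from http.server import HTTPServer, SimpleHTTPRequestHandler

server = HTTPServer(('localhost', 8888), SimpleHTTPRequestHandler)
server.serve_forever()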
The contents of the HTML files are very simple, for example:
<body>