Crawler (spider) code:
# - * - coding: utf-8 - * -
The import scrapy
The from iqiyi. Items import IqiyiItem
The from scrapy. The selector import the selector
The from scrapy. Contrib. Spiders import CrawlSpider, Rule
The from scrapy. Contrib. Linkextractors. SGML import SgmlLinkExtractor
The class IqiyiSpider (CrawlSpider) :
Name="iqiyi
"Allowed_domains=(" list.iqiyi.com ")
# download_delay=1
Start_urls=[" http://list.iqiyi.com/www/2/-------------11-1-1-iqiyi --. HTML "]
Rules=[
# TV series
Rule (SgmlLinkExtractor (allow=(' * http://list.iqiyi.com/www/2/-----------. - 4 - \ d + 1 - iqiyi -. HTML ')), the callback="parse_item", follow=True)
]
Def parse_item (self, response) :
Sel=the Selector (response)
The item=IqiyiItem ()
# film name
Item [' name ']=sel. Xpath ('///@/p a/text () '). The extract ()
# item [' name ']=[n.e ncode (' utf8) for n in name]
# film type
Item [' classification ']=sel. Xpath ('/HTML/body/div [4]/div/div/div [1]/div [1]/ul/li/@/a/text () '). The extract ()
# albumid
Item [' albumid ']=sel. Xpath ('//a/@ data - qidanadd - albumid '). The extract ()
# tvid
Item [' tvid]=sel. Xpath ('//a/@ data - qidanadd - tvid '). The extract ()
Return the item
Pipeline code:
# - * - coding: utf-8 - * -
The from scrapy import log
The from the twisted. Enterprise import adbapi # import twisted package
The import MySQLdb
The import MySQLdb. Your cursors
Import a datetime
The class IqiyiPipeline (object) :
Def __init__ (self) : # initialization to connect the mysql database information
Self. Dbpool=adbapi. ConnectionPool (
DbapiName='MySQLdb',
The host='127.0.0.1,
The db='iqiyi,
User='root',
Passwd='root',
Cursorclass=MySQLdb. Your cursors. DictCursor,
Charset='utf8,
Use_unicode=False
)
# pipeline dafault function # this function is the function called pipeline default
Def process_item (self, item, spiders) :
Query=self. Dbpool. RunInteraction (self) _conditional_insert, item)
Return the item
# # insert the data to the databases inserted the data into the database
Def _conditional_insert (self, tx, item) :
Lenname=len (item [' name '])
The item [' classification '] *=lenname
In the range for n (lenname) :
Tx. The execute (" select * from tv1 where name=% s ", (item [' name '] [n]))
Result=tx. Fetchone ()
If the result:
Pass
The else:
SQL="insert into tv1 (name, classification, albumid tvid) values (% s, % s, % s, % s)"
Tx. The execute (SQL, (item [' name '] [n], item [' classification '] [n], item [' albumid '] [n], item [' tvid] [n]))
Reply:
Hey, did you ever find the answer? I was crawling e-commerce data and also got a lot of duplicate rows, and clearly not all of the data was crawled. Any solution?

Reply:
It may be a problem with the MySQL connection module. The Python torndb module occasionally misbehaves on queries: instead of raising an error it silently returns an empty result. So you had better add a UNIQUE constraint in MySQL, or switch to another connection library such as SQLAlchemy (though it is heavyweight).

Reply:
Following — I ran into the same problem.

Reply:
I also hit the same problem — did you solve it? There is an answer here, but I did not understand it: https://segmentfault.com/q/1010000003070627/a-1020000005151057

Reply:
I also encountered this problem. My workaround was to switch to regular-expression parsing instead of XPath. I don't know why that helps — perhaps an expert can explain.
Reply: