import re

import requests
def getHtmlText(url, cookie=None, timeout=10):
    """Download ``url`` and return the response body as text.

    The request imitates the browser's XHR call to the shop's async search
    endpoint (same Accept, Referer, User-Agent and X-Requested-With values).

    Args:
        url: Absolute URL to fetch.
        cookie: Optional value for the ``Cookie`` request header. When omitted,
            the ``TAOBAO_COOKIE`` environment variable is used if it is set.
            A session cookie is a credential and expires, so it is supplied at
            run time instead of being hard-coded in the source.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text, or ``""`` when the request fails or the
        server answers with an HTTP error status.
    """
    import os  # local import: only needed for the cookie fallback below

    # Only real request headers are listed. HTTP/2 pseudo-headers copied from
    # browser dev tools (authority/method/path/scheme) are not valid header
    # fields for requests, and Accept-Encoding is left to requests so that we
    # never advertise an encoding (e.g. "br") it cannot decode.
    headers = {
        'accept': ('text/javascript, application/javascript, '
                   'application/ecmascript, application/x-ecmascript, '
                   '*/*; q=0.01'),
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'referer': 'https://me-too1980.taobao.com/search.htm?search=y',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47'),
        'x-requested-with': 'XMLHttpRequest',
    }
    session_cookie = cookie or os.environ.get("TAOBAO_COOKIE")
    if session_cookie:
        headers['cookie'] = session_cookie

    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Let requests guess the charset from the body; the declared one is
        # not always reliable for this endpoint.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: report the failure and let the caller carry on
        # with an empty page.
        print(" crawl failure ")
        return ""
def parsePage(ilist, html):
    """Extract product rows from a search-result page and append them to ``ilist``.

    Three things are pulled out of ``html`` in document order: the ``alt`` text
    of each ``<img alt="...">`` tag (product name), the number following each
    ``c-price">`` marker (price) and the number following each ``sale-num">``
    marker (sales count). The i-th name, price and count are combined into one
    row ``[name, price, sale_count]``.

    Args:
        ilist: List that receives the rows; it is modified in place.
        html: Page text. Quotes may be plain (``"``) or backslash-escaped
            (``\\"``), as they appear inside a JSONP payload.

    Returns:
        None. Prints " parse error " when ``html`` is not a string or when
        fewer prices/sales counts than names were found; rows that could be
        paired up are still appended in the latter case.
    """
    if not isinstance(html, str):
        print(" parse error ")
        return

    # Capture groups return just the value, so no second split pass is needed.
    names = re.findall(r'<img\s+alt=\\?"(.*?)\\?"', html, flags=re.IGNORECASE)
    prices = re.findall(r'c-price\\?">\s*(\d+(?:\.\d*)?)\s*<', html)
    sales = re.findall(r'sale-num\\?">\s*(\d+)\s*<', html)

    # float()/int() instead of eval(): the text comes from a remote server and
    # must never be executed; int() also accepts values such as "012".
    for name, price, sale_count in zip(names, prices, sales):
        ilist.append([name, float(price), int(sale_count)])

    if len(names) > min(len(prices), len(sales)):
        # Some products had no matching price or sales figure.
        print(" parse error ")
def printGoodsList(ilist):
    """Print the collected products as a tab-separated table on stdout.

    Output is a separator line, a header row, one numbered row per product
    (numbering starts at 1) and a closing separator line.

    Args:
        ilist: Iterable of rows shaped ``[name, price, sale_count]``.
    """
    separator = "====================================================================================================="
    # Left-aligned columns: serial number (3), name (70), price (6), sales (6).
    tplt = "{0:<3}\t{1:<70}\t{2:<6}\t{3:<6}"

    print(separator)
    print(tplt.format("serial number", "product name", "price", "sales"))
    for count, g in enumerate(ilist, start=1):
        print(tplt.format(count, g[0], g[1], g[2]))
    print(separator)
def main():
    """Crawl the first ``depth`` result pages of the shop and print the products."""
    # Number of pages to crawl. This is a range, not a single page number:
    # 2 means pages 1 and 2 are fetched.
    depth = 2
    start_url = "https://me-too1980.taobao.com/i/asynSearch.htm?_ksTS="
    infoList = []
    for i in range(depth):
        # Pages are numbered from 1 on the site, the loop index from 0.
        url = start_url + '&pageNo=' + str(1 + i)
        try:
            html = getHtmlText(url)
            parsePage(infoList, html)
        except Exception:
            # One bad page must not stop the remaining pages from being tried.
            continue
    printGoodsList(infoList)
# Call main() at module level, so the crawl starts as soon as this file is executed.
main()
CodePudding user response:
Taobao provides official store tools for this, so there is no need to go to the trouble of scraping the data yourself.