Home > Other > Python crawler: after converting the scraped dictionary data to JSON, the output shows only the last record
Python crawler: after converting the scraped dictionary data to JSON, the output shows only the last record

Time:10-12

# coding=utf-8
import json

import requests
from bs4 import BeautifulSoup

class zufangSpider:
    """Crawl rental listings from nj.lianjia.com and save them page by page.

    For every result page the spider downloads the HTML, extracts the
    listings, serialises *all* of them as one JSON array and writes that
    text to a file named after the category and the page number.
    """

    def __init__(self, anjuke_name, page_count=1000):
        """Store the crawl settings.

        Args:
            anjuke_name: Category path segment of the site, e.g. ``'zufang'``.
            page_count: Number of result pages to visit (defaults to the
                1000 pages the script has always requested).
        """
        self.anjuke_name = anjuke_name
        self.page_count = page_count
        self.url_temp = 'https://nj.lianjia.com/' + anjuke_name + '/pg{}'
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/77.0.3865.120 Safari/537.36'
            ),
        }

    def get_url_list(self):
        """Return the URL of every result page, in page order."""
        # Numbering starts at 1 so the number in the URL is the same number
        # ``run`` uses for the saved file.
        # NOTE(review): assumes the site paginates from pg1 - confirm.
        return [
            self.url_temp.format(page_num)
            for page_num in range(1, self.page_count + 1)
        ]

    @staticmethod
    def _collect_listings(titles, prices, rooms, imgs, cates):
        """Combine the parallel element lists into one dict per listing.

        The lists are paired positionally with ``zip``, so the result is as
        long as the shortest input list.
        """
        return [
            {
                'title': title.get_text(),
                'price': price.get_text(),
                'room': list(room.stripped_strings),
                'img': img.get('src'),
                'cate': list(cate.stripped_strings),
            }
            for title, price, room, img, cate
            in zip(titles, prices, rooms, imgs, cates)
        ]

    def parse_url(self, url):
        """Download one result page and return its listings as JSON text.

        Args:
            url: Address of the result page to fetch.

        Returns:
            A JSON array (as ``str``) holding one object per listing found
            on the page.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        print(url)
        # A timeout keeps one stalled connection from blocking the crawl.
        response = requests.get(url, headers=self.headers, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')
        listings = self._collect_listings(
            titles=soup.select('p.content__list--item--title.twoline > a'),
            prices=soup.select('div > span'),
            rooms=soup.select('p.content__list--item--des'),
            imgs=soup.select('a > img'),
            cates=soup.select('p.content__list--item--bottom.oneline'),
        )
        for listing in listings:
            print(listing)
        # Serialise the whole list. Dumping inside the loop and returning
        # afterwards kept only the last listing of each page.
        return json.dumps(listings, ensure_ascii=False)

    def save_html(self, html_str, page_num):
        """Write the text produced for one page to its own file.

        Args:
            html_str: Text to store (the JSON returned by ``parse_url``).
            page_num: 1-based page number used in the file name.
        """
        file_path = '{}-page-{}.html'.format(self.anjuke_name, page_num)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_str)

    def run(self):
        """Fetch, parse and save every page in turn."""
        # enumerate supplies the page number directly; looking it up with
        # list.index() rescanned the list for every URL.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)


# Script entry point: crawl the 'zufang' (rentals) category when the file is
# executed directly rather than imported.
if __name__ == "__main__":
    zufang_spider = zufangSpider('zufang')
    zufang_spider.run()
  • Related