Home > Back-end > How to crawl the content of multiple pages whose URLs are listed in an Excel file (one page of site content per URL)
How to crawl the content of multiple pages whose URLs are listed in an Excel file (one page of site content per URL)

Time:11-25

Package net. Chinaedu. Utils; import java.io.IOException; Import the Java. Util. ArrayList; import java.util.List; The import org. Jsoup. Jsoup; The import org. Jsoup. Nodes. The Document; The import org. Jsoup. Nodes. Element; The import org. Jsoup. Select. Elements; The import net. Chinaedu. Bean. The Phone;/* * * use Jsoup crawler, and the data parsing, wrapped in Phone object, and stored in the List in the collection * * @ author Administrator * */public class JsoupUtils {/* * * get specifies the source link, and to Document the Document object returned @ url param * @ return * * * @ throws IOException */public static Document getHtmlDocument (String url) throws IOException {//crawl web source, to get the Document object Document Document=Jsoup. Connect (url). The get ();//will return to return the document web document object; }/* * * parse web page, will parse out the contents of the package to Phone first entity class, and then keep to List in the collection, and returning * * @ param @ the return document * */public static List GetPhoneList (Document Document) {//1. Define a can hold Phone collection of object List PhoneList=new ArrayList (a);//2. Begin to parse the document//2.1 by tag attributes with multiple Elements Elements Elements=document. GetElementsByAttribute (" data - follow - id ");//2.2 traverse multiple tags object for (int I=0; I & lt; Elements. The size (); I++) {//2.3 for each tag Element Element Element=elements. The get (I);//2.4 Phone object for each attribute value//against 2.4.1 Phone name/* * getElementsByTag (" img ") : get img tags * attr (" Alt ") : get the Alt attribute value */String name=element. GetElementsByTag (" img "). Attr (" Alt ");//configured 2.4.2 String config=element. GetElementsByTag (" span "). The first (). The text ();//2.4.3 get price String priceStr=element. GetElementsByClass (" price - type "). 
The text ();//price into a string type double price/* * double parseDouble (priceStr) : the string class, the decimal into double * Integer parseInt (intStr) : the string's Integer into int */double price=(priceStr==null)? Zero: Double parseDouble (priceStr);//String 2.4.4 get score scoreStr=element. GetElementsByClass (" score "). The text (); Ratings into//the string of type double score double score=(scoreStr==null)? Zero: Double parseDouble (scoreStr);//2.4.5 get comments on several String numStr=element. GetElementsByClass (" comment - num "). The text ();//get the number in the comments on several string int num=StringUtils. GetNumByString (numStr);//2.4.6 gets hot list String rank=element. GetElementsByClass (" rank - row "). The text ();//2.4.7 get cell phone pictures of String img=element. GetElementsByTag (" img "). The first (). The attr (" SRC ");//2.5 using the above seven values build Phone Phone=new Phone (the name, the config, price, score, num, rank, img);//2.7 hold the phone to the collection phoneList. Add (phone);
//2.8 to download photos to the local
//output log
System. Out.println (" [] "+ name +" crawl over... "); }
//return collection objects
Return phoneList. }}
  • Related