package net.chinaedu.utils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import net.chinaedu.bean.Phone;

/**
 * Crawls a web page with Jsoup, parses the data it contains, wraps each
 * entry in a {@link Phone} object and collects the results in a {@link List}.
 *
 * @author Administrator
 */
public class JsoupUtils {

    /**
     * Fetches the page behind the given link and returns it as a Jsoup document.
     *
     * @param url the address of the page to fetch
     * @return the parsed document
     * @throws IOException if the page cannot be fetched
     */
    public static Document getHtmlDocument(String url) throws IOException {
        return Jsoup.connect(url).get();
    }

    /**
     * Parses the page, wraps every phone entry found in a {@link Phone} and
     * returns all of them in document order.
     *
     * <p>An entry is any element carrying a {@code data-follow-id} attribute.
     * Missing price or score text is treated as {@code 0}; a missing
     * {@code span} or {@code img} yields an empty configuration or image URL
     * instead of a {@link NullPointerException}.
     *
     * @param document the page to parse
     * @return the phones found in the page; empty if there are none
     * @throws NumberFormatException if a price or score is present but is not
     *         a valid decimal number
     */
    public static List<Phone> getPhoneList(Document document) {
        // 1. Every element with a data-follow-id attribute is one phone entry.
        Elements elements = document.getElementsByAttribute("data-follow-id");
        List<Phone> phoneList = new ArrayList<Phone>(elements.size());

        // 2. Extract the seven Phone attributes from each entry.
        for (Element element : elements) {
            Elements images = element.getElementsByTag("img");

            // 2.1 Name: the alt attribute of the img tag.
            String name = images.attr("alt");

            // 2.2 Configuration: text of the first span (first() is null when there is none).
            Element firstSpan = element.getElementsByTag("span").first();
            String config = (firstSpan == null) ? "" : firstSpan.text();

            // 2.3 Price and 2.4 score. Elements.text() yields "" rather than null
            // when nothing matches, so blank text has to be handled explicitly.
            double price = parseDoubleOrZero(element.getElementsByClass("price-type").text());
            double score = parseDoubleOrZero(element.getElementsByClass("score").text());

            // 2.5 Number of comments: extract the number embedded in the text.
            String numStr = element.getElementsByClass("comment-num").text();
            int num = StringUtils.getNumByString(numStr);

            // 2.6 Hot-list rank.
            String rank = element.getElementsByClass("rank-row").text();

            // 2.7 Picture URL: src of the first img (first() is null when there is none).
            Element firstImage = images.first();
            String img = (firstImage == null) ? "" : firstImage.attr("src");

            // 2.8 Build the Phone from the seven values and keep it.
            phoneList.add(new Phone(name, config, price, score, num, rank, img));

            // NOTE: the original notes mention downloading the picture locally
            // at this point; no download code is present in this method.

            // Output log.
            System.out.println(" [] " + name + " crawl over... ");
        }

        return phoneList;
    }

    /** Parses a decimal string, returning 0 for null or blank input. */
    private static double parseDoubleOrZero(String text) {
        if (text == null) {
            return 0;
        }
        String trimmed = text.trim();
        return trimmed.isEmpty() ? 0 : Double.parseDouble(trimmed);
    }
}