zoukankan      html  css  js  c++  java
  • JSOUP爬虫示例

    利用JSOUP做爬虫,爬取我博客中的所有标题加链接,代码示例如下:

    package com.test.jsoup;
    
    import java.io.IOException;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class TestDemo4 {
    	
    	public String baseUrl = "http://www.cnblogs.com/zhangfei/p/";
    	
    	public String pager = "?page=%s";
    	
    	public int getAllPageCount(){
    		int count = 0;
    		try {
    			Document doc = Jsoup.connect(baseUrl).get();
    			String countText = doc.select("#myposts>div.pager:nth-of-type(1)>.Pager").text();
    			countText = countText.replaceFirst("\D+(\d+).*", "$1");
    			count = Integer.valueOf(countText);
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    		return count;
    	}
    	
    	public void crawler(){
    		int count = this.getAllPageCount();
    		for (int i = 1; i <= count; i++) {
    			String url = baseUrl + String.format(pager, i);
    			this.testJsop(url);
    		}
    	}
    
    	public void testJsop(String url) {
    		try {
    			Document doc = Jsoup.connect(url).get();
    			Elements element = doc.select("div.PostList a");
    			for (Element e : element) {
    				String text = e.text();
    				String href = e.attr("href");
    				System.out.println(text+" : "+href);
    			}			
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    	}
    
    	public static void main(String[] args) {
    		TestDemo4 t = new TestDemo4();
    		t.crawler();
    	}
    }
    
  • 相关阅读:
    华为机试练习(一)
    LM拟合算法
    5.1 模块化程序设计
    第3周 运算的流程控制
    KEGG数据库介绍
    topGO
    GO.db
    Bioconductor应用领域之基因芯片
    org.Hs.eg.db包简介(转换NCBI、ensemble等数据库中基因ID,symbol等之间的转换)
    Bioconductor的历史
  • 原文地址:https://www.cnblogs.com/zhangfei/p/4729279.html
Copyright © 2011-2022 走看看