zoukankan      html  css  js  c++  java
  • java 之webmagic 网络爬虫

    webmagic简介:

        WebMagic是一个简单灵活的Java爬虫框架。你可以快速开发出一个高效、易维护的爬虫。

        http://webmagic.io/

    准备工作:

      

      Maven依赖(我这里用Maven创建了一个web项目做测试):    

    复制代码
    <dependencies>
        <!-- JUnit (tests only) -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <!-- Logging API -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.12</version>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-core</artifactId>
            <version>1.2.3</version>
        </dependency>
        <!-- Logback binding that implements the SLF4J API -->
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>1.2.3</version>
        </dependency>

        <!-- Database driver.
             Default (compile) scope on purpose: the sample code imports
             com.mysql.jdbc.Connection directly, which is not visible at compile
             time when the driver is declared with runtime scope. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.34</version>
        </dependency>
        <!-- c3p0 connection pool -->
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>

        <!-- DAO framework: MyBatis -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.4.0</version>
        </dependency>
        <!-- MyBatis / Spring integration -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>1.3.0</version>
        </dependency>

        <!-- Servlet / web dependencies -->
        <dependency>
            <groupId>taglibs</groupId>
            <artifactId>standard</artifactId>
            <version>1.1.2</version>
        </dependency>
        <dependency>
            <groupId>jstl</groupId>
            <artifactId>jstl</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.5.1</version>
        </dependency>
        <!-- The servlet container supplies this API at runtime, so it must not
             be packaged into the WAR. -->
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
            <version>3.1.0</version>
            <scope>provided</scope>
        </dependency>

        <!-- Spring -->
        <!-- 1. Spring core -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-beans</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <!-- 2. Spring DAO layer -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>

        <!-- Spring web -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-web</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-webmvc</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>

        <!-- Spring test support; version kept in step with the other Spring modules -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-test</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <!-- WebMagic crawler framework -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
    </dependencies>
    复制代码

       数据库表SQL:

    复制代码
    -- Table that stores one row per crawled blog post entry.
    -- The column name `linke` is kept exactly as spelled because the crawler's
    -- INSERT statement refers to it by that name.
    -- No AUTO_INCREMENT start value is given, so a freshly created table begins at id 1.
    CREATE TABLE `Boke` (
      `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
      `title` varchar(255) DEFAULT NULL COMMENT '标题',
      `linke` varchar(255) DEFAULT NULL COMMENT '正文地址',
      `author` varchar(255) DEFAULT NULL COMMENT '作者',
      `authorUrl` varchar(255) DEFAULT NULL COMMENT '作者主页',
      `summary` varchar(1000) DEFAULT NULL COMMENT '简介',
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    复制代码

     

     数据库连接工具类:

    import java.sql.DriverManager;
    import java.sql.SQLException;

    import com.mysql.jdbc.Connection;

    /**
     * JDBC helper that opens connections to a MySQL database using the driver class,
     * URL and credentials configured in this class.
     */
    public class MySqlJdbcUtils {

        private static String driver = "com.mysql.jdbc.Driver";
        private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
        private static String name="tradingbp";
        private static String pwd="123456";

        /**
         * Loads the configured driver class and opens a new connection.
         *
         * @return the opened connection, or {@code null} when the driver class cannot be
         *         found or the connection attempt fails (the stack trace is printed)
         */
        public static Connection getOpenConnection() {
            try {
                // Make sure the driver class is loaded before asking DriverManager for a connection.
                Class.forName(driver);
                Connection opened = (Connection) DriverManager.getConnection(url, name, pwd);
                System.out.println("获得数据库链接");
                return opened;
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            return null;
        }

        /** Manual smoke test: tries to open a connection once. */
        public static void main(String[] args) {
            getOpenConnection();
        }

    }


    复制代码
    import java.sql.DriverManager;
    import java.sql.SQLException;
    
    import com.mysql.jdbc.Connection;
    
    /**
     * JDBC helper that opens connections to a MySQL database using the driver class,
     * URL and credentials configured in this class.
     */
    public class MySqlJdbcUtils {

        private static String driver = "com.mysql.jdbc.Driver";
        private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
        private static String name="tradingbp";
        private static String pwd="123456";

        /**
         * Loads the configured driver class and opens a new connection.
         *
         * @return the opened connection, or {@code null} when the driver class cannot be
         *         found or the connection attempt fails (the stack trace is printed)
         */
        public static Connection getOpenConnection() {
            try {
                // Make sure the driver class is loaded before asking DriverManager for a connection.
                Class.forName(driver);
                Connection opened = (Connection) DriverManager.getConnection(url, name, pwd);
                System.out.println("获得数据库链接");
                return opened;
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            return null;
        }

        /** Manual smoke test: tries to open a connection once. */
        public static void main(String[] args) {
            getOpenConnection();
        }

    }
    复制代码

     实体类:

    /**
     * Value holder for one blog post entry: title, post link, author, author home page
     * and summary. All properties are plain strings and start out as {@code null}.
     */
    public class JavaBokeModel {

        /** Post title. */
        private String title;

        /** URL of the post. */
        private String linke;

        /** Author name. */
        private String author;

        /** URL of the author's home page. */
        private String authorUrl;

        /** Short summary of the post. */
        private String summary;

        /** @return the post title */
        public String getTitle() {
            return this.title;
        }

        /** @param title the post title */
        public void setTitle(String title) {
            this.title = title;
        }

        /** @return the URL of the post */
        public String getLinke() {
            return this.linke;
        }

        /** @param linke the URL of the post */
        public void setLinke(String linke) {
            this.linke = linke;
        }

        /** @return the author name */
        public String getAuthor() {
            return this.author;
        }

        /** @param author the author name */
        public void setAuthor(String author) {
            this.author = author;
        }

        /** @return the URL of the author's home page */
        public String getAuthorUrl() {
            return this.authorUrl;
        }

        /** @param authorUrl the URL of the author's home page */
        public void setAuthorUrl(String authorUrl) {
            this.authorUrl = authorUrl;
        }

        /** @return the post summary */
        public String getSummary() {
            return this.summary;
        }

        /** @param summary the post summary */
        public void setSummary(String summary) {
            this.summary = summary;
        }

    }

    webmagic 框架爬取数据并保存

       

    复制代码
    import java.sql.PreparedStatement;
    import java.sql.SQLException;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.processor.PageProcessor;
    
    import com.mysql.jdbc.Connection;
    import com.nio.webmagic.jdbc.MySqlJdbcUtils;
    import com.nio.webmagic.model.JavaBokeModel;
    /**
     * WebMagic page processor that reads the cnblogs Java category listing pages and
     * stores every listed post (title, link, author, author home page, summary) in the
     * {@code Boke} table.
     *
     * <p>One JDBC connection is shared by the whole process. All access to it goes
     * through class-level synchronized methods because {@link #main(String[])} runs the
     * spider with several worker threads.
     */
    public class JavaBoKePageProcessor implements PageProcessor {
        /** Listing URL used as the crawl entry point and as prefix of the numbered pages. */
        private static final String LIST_URL = "http://www.cnblogs.com/cate/java/";
        /** First and last numbered listing page that is queued. */
        private static final int FIRST_PAGE = 2;
        private static final int LAST_PAGE = 200;

        /** CSS query for the title anchor; its text is the title, its href the post link. */
        private static final String TITLE_QUERY = "div.post_item_body h3 a.titlelnk";
        /** CSS query for the author anchor; its text is the name, its href the home page. */
        private static final String AUTHOR_QUERY = "div.post_item_foot a.lightblue ";
        /** CSS query for the summary paragraph. */
        private static final String SUMMARY_QUERY = "div.post_item_body p.post_item_summary";
        /** Parameterized insert statement for one post row. */
        private static final String INSERT_SQL = "INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)";

        /** Numbered listing pages, built once instead of once per processed page. */
        private static final List<String> PAGE_URLS = urls();

        /** Shared connection, opened lazily by {@link #getConnection()}. */
        private static Connection conn = null;

        /** Crawl settings: 1000 ms sleep between requests, up to 10 retries. */
        private final Site site = Site.me().setSleepTime(1000).setRetryTimes(10);

        /**
         * Returns the shared connection, opening it on first use.
         *
         * @return the shared connection, or {@code null} if it could not be opened
         */
        private static synchronized Connection getConnection() {
            if (conn == null) {
                conn = MySqlJdbcUtils.getOpenConnection();
            }
            return conn;
        }

        /** Closes the shared connection, if any, and forgets it. */
        private static synchronized void closeConnection() {
            if (conn == null) {
                return;
            }
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            } finally {
                conn = null;
            }
        }

        /** Closes a statement, reporting (not propagating) any failure. */
        private static void closeStatement(PreparedStatement statement) {
            if (statement == null) {
                return;
            }
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }

        /**
         * Inserts the given posts, one row each, through the shared connection.
         *
         * <p>The statement is created and closed inside this call so no statement
         * outlives the batch. A SQL failure is reported and ends the current batch;
         * it does not stop the crawl.
         *
         * @param javaBokes posts to store; {@code null} or empty is a no-op
         */
        private static synchronized void insertDb(List<JavaBokeModel> javaBokes) {
            if (javaBokes == null || javaBokes.isEmpty()) {
                return;
            }
            Connection connection = getConnection();
            if (connection == null) {
                System.err.println("No database connection available; skipping "
                        + javaBokes.size() + " record(s)");
                return;
            }
            PreparedStatement statement = null;
            try {
                statement = connection.prepareStatement(INSERT_SQL);
                for (JavaBokeModel javaBoke : javaBokes) {
                    statement.setString(1, javaBoke.getTitle());
                    statement.setString(2, javaBoke.getLinke());
                    statement.setString(3, javaBoke.getAuthor());
                    statement.setString(4, javaBoke.getAuthorUrl());
                    statement.setString(5, javaBoke.getSummary());
                    statement.executeUpdate();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            } finally {
                closeStatement(statement);
            }
        }

        /**
         * Builds the list of numbered listing pages to crawl.
         *
         * @return the URLs {@code LIST_URL + n} for every n from FIRST_PAGE to LAST_PAGE
         */
        private static List<String> urls() {
            List<String> listUrl = new ArrayList<String>(LAST_PAGE - FIRST_PAGE + 1);
            for (int i = FIRST_PAGE; i <= LAST_PAGE; i++) {
                listUrl.add(LIST_URL + i);
            }
            return listUrl;
        }

        /**
         * Returns the combined text of the elements matched by a CSS query.
         *
         * @param doc parsed HTML to search
         * @param query CSS query
         * @return the matched text
         */
        private static String selectText(Document doc, String query) {
            return doc.select(query).text();
        }

        /**
         * Returns the {@code href} attribute selected by a CSS query.
         *
         * @param doc parsed HTML to search
         * @param query CSS query
         * @return the {@code href} value
         */
        private static String selectLink(Document doc, String query) {
            return doc.select(query).attr("href");
        }

        /**
         * Queues the numbered listing pages, extracts every {@code post_item} entry of the
         * current page and stores the extracted posts.
         *
         * @see us.codecraft.webmagic.processor.PageProcessor#process(us.codecraft.webmagic.Page)
         */
        @Override
        public void process(Page page) {
            page.addTargetRequests(PAGE_URLS);

            List<String> htmls = page.getHtml().xpath("//div[@class='post_item']/html()").all();
            List<JavaBokeModel> javaBokes = new ArrayList<JavaBokeModel>(htmls.size());
            for (String html : htmls) {
                // Parse each entry once and run all queries against the same document.
                Document item = Jsoup.parse(html);
                JavaBokeModel javaBoke = new JavaBokeModel();
                javaBoke.setTitle(selectText(item, TITLE_QUERY));
                javaBoke.setLinke(selectLink(item, TITLE_QUERY));
                javaBoke.setAuthor(selectText(item, AUTHOR_QUERY));
                javaBoke.setAuthorUrl(selectLink(item, AUTHOR_QUERY));
                javaBoke.setSummary(selectText(item, SUMMARY_QUERY));
                javaBokes.add(javaBoke);
            }
            insertDb(javaBokes);
        }

        /**
         * Returns the crawl settings for the target site.
         *
         * @return the site configuration (sleep time and retry count)
         */
        @Override
        public Site getSite() {
            return site;
        }

        /**
         * Opens the database connection, runs the spider with five threads starting at the
         * listing URL, closes the connection and prints the elapsed time in seconds.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            System.out.println("========小爬虫【启动】喽!=========");
            if (getConnection() == null) {
                // Nothing could be stored, so do not start crawling at all.
                System.err.println("Could not open the database connection; crawler not started");
                return;
            }
            long startTime = System.currentTimeMillis();
            try {
                Spider.create(new JavaBoKePageProcessor())
                        .addUrl(LIST_URL)
                        .thread(5)
                        .run();
            } finally {
                closeConnection();
            }
            long endTime = System.currentTimeMillis();
            System.out.println("========小爬虫【结束】喽!=========");
            System.out.println("用时为:"+(endTime-startTime)/1000+"s");
        }

    }
    复制代码

    数据:

  • 相关阅读:
    IIS的各种身份验证详细测试
    HTTP Error 401.3 Unauthorized Error While creating IIS 7.0 web site on Windows 7
    C/S and B/S
    WCF ContractFilter mismatch at the EndpointDispatcher exception
    Configure WCF
    Inheritance VS Composition
    Unhandled Error in Silverlight Application, code 2103 when changing the namespace
    Java RMI VS TCP Socket
    Principles Of Object Oriented Design
    Socket处理发送和接收数据包,一个小实例:
  • 原文地址:https://www.cnblogs.com/aibabel/p/11017558.html
Copyright © 2011-2022 走看看