zoukankan      html  css  js  c++  java
  • java 之webmagic 网络爬虫

    webmagic简介:

        WebMagic是一个简单灵活的Java爬虫框架。你可以快速开发出一个高效、易维护的爬虫。

        http://webmagic.io/

    准备工作:

      

      Maven依赖(我这里用的Maven创建的web项目做测试):    

    复制代码
    <dependencies>
        <!-- JUnit (unit tests only) -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <!-- Logging: SLF4J API -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.12</version>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-core</artifactId>
            <version>1.2.3</version>
        </dependency>
        <!-- logback-classic: the SLF4J binding that routes SLF4J calls to logback -->
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>1.2.3</version>
        </dependency>

        <!-- Database -->
        <!--
            Default (compile) scope on purpose: the Java sources import
            com.mysql.jdbc.Connection directly, so the driver must be on the
            compile classpath. With <scope>runtime</scope> those classes do not compile.
        -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.34</version>
        </dependency>
        <!-- c3p0 connection pool -->
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>

        <!-- DAO framework: MyBatis -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.4.0</version>
        </dependency>
        <!-- MyBatis-Spring integration -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>1.3.0</version>
        </dependency>

        <!-- Servlet / web dependencies -->
        <dependency>
            <groupId>taglibs</groupId>
            <artifactId>standard</artifactId>
            <version>1.1.2</version>
        </dependency>
        <dependency>
            <groupId>jstl</groupId>
            <artifactId>jstl</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.5.1</version>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
            <version>3.1.0</version>
        </dependency>

        <!-- Spring dependencies (keep every Spring module on the same version) -->
        <!-- 1. Spring core -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-beans</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <!-- 2. Spring DAO layer -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>

        <!-- Spring web -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-web</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-webmvc</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>

        <!-- Spring test support; version aligned with the other Spring modules -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-test</artifactId>
            <version>4.2.5.RELEASE</version>
        </dependency>
        <!-- WebMagic crawler framework -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
    </dependencies>
    复制代码

       数据库表SQL:

    复制代码
    -- Stores one row per crawled blog post.
    -- The column names (including the spelling `linke`) are referenced verbatim by
    -- the crawler's INSERT statement, so they must not be renamed here.
    -- No AUTO_INCREMENT start value is given: a freshly created table begins at id 1
    -- instead of inheriting a counter left over from an exported table.
    CREATE TABLE `Boke` (
      `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
      `title` varchar(255) DEFAULT NULL COMMENT '标题',
      `linke` varchar(255) DEFAULT NULL COMMENT '正文地址',
      `author` varchar(255) DEFAULT NULL COMMENT '作者',
      `authorUrl` varchar(255) DEFAULT NULL COMMENT '作者主页',
      `summary` varchar(1000) DEFAULT NULL COMMENT '简介',
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    复制代码

     

     数据库连接工具类:

    import java.sql.DriverManager;
    import java.sql.SQLException;

    import com.mysql.jdbc.Connection;

    /**
     * Small JDBC helper that opens MySQL connections from settings fixed in code.
     */
    public class MySqlJdbcUtils {

        // Connection settings; the host, schema and credentials are hard-coded.
        private static final String driver = "com.mysql.jdbc.Driver";
        private static final String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
        private static final String name = "tradingbp";
        private static final String pwd = "123456";

        /**
         * Loads the MySQL driver class and opens a new connection.
         *
         * <p>Failures are not propagated: the stack trace is printed and
         * {@code null} is returned, so callers must check the result.
         *
         * @return the opened connection, or {@code null} when the driver class is
         *         missing or the connection attempt fails
         */
        public static Connection getOpenConnection() {
            try {
                // Load the driver class before asking DriverManager for a connection.
                Class.forName(driver);
                Connection opened = (Connection) DriverManager.getConnection(url, name, pwd);
                System.out.println("获得数据库链接");
                return opened;
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            return null;
        }

        /**
         * Manual smoke test: tries to open one connection.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            getOpenConnection();
        }

    }


    复制代码
    import java.sql.DriverManager;
    import java.sql.SQLException;
    
    import com.mysql.jdbc.Connection;
    
    /**
     * Small JDBC helper that opens MySQL connections from settings fixed in code.
     */
    public class MySqlJdbcUtils {

        // Connection settings; the host, schema and credentials are hard-coded.
        private static final String driver = "com.mysql.jdbc.Driver";
        private static final String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
        private static final String name = "tradingbp";
        private static final String pwd = "123456";

        /**
         * Loads the MySQL driver class and opens a new connection.
         *
         * <p>Failures are not propagated: the stack trace is printed and
         * {@code null} is returned, so callers must check the result.
         *
         * @return the opened connection, or {@code null} when the driver class is
         *         missing or the connection attempt fails
         */
        public static Connection getOpenConnection() {
            try {
                // Load the driver class before asking DriverManager for a connection.
                Class.forName(driver);
                Connection opened = (Connection) DriverManager.getConnection(url, name, pwd);
                System.out.println("获得数据库链接");
                return opened;
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            return null;
        }

        /**
         * Manual smoke test: tries to open one connection.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            getOpenConnection();
        }

    }
    复制代码

     实体类:

    /**
     * Mutable bean describing one crawled blog post: its title, link, author,
     * author home page and summary. Every property starts out as {@code null}.
     */
    public class JavaBokeModel {

        /** Post title. */
        private String title;

        /** URL of the post body. */
        private String linke;

        /** Author display name. */
        private String author;

        /** URL of the author's home page. */
        private String authorUrl;

        /** Short summary of the post. */
        private String summary;

        /** @return the post title */
        public String getTitle() {
            return this.title;
        }

        /** @param title the post title */
        public void setTitle(String title) {
            this.title = title;
        }

        /** @return the URL of the post body */
        public String getLinke() {
            return this.linke;
        }

        /** @param linke the URL of the post body */
        public void setLinke(String linke) {
            this.linke = linke;
        }

        /** @return the author display name */
        public String getAuthor() {
            return this.author;
        }

        /** @param author the author display name */
        public void setAuthor(String author) {
            this.author = author;
        }

        /** @return the URL of the author's home page */
        public String getAuthorUrl() {
            return this.authorUrl;
        }

        /** @param authorUrl the URL of the author's home page */
        public void setAuthorUrl(String authorUrl) {
            this.authorUrl = authorUrl;
        }

        /** @return the post summary */
        public String getSummary() {
            return this.summary;
        }

        /** @param summary the post summary */
        public void setSummary(String summary) {
            this.summary = summary;
        }

    }

    webmagic 框架爬取数据并保存

       

    复制代码
    import java.sql.PreparedStatement;
    import java.sql.SQLException;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.processor.PageProcessor;
    
    import com.mysql.jdbc.Connection;
    import com.nio.webmagic.jdbc.MySqlJdbcUtils;
    import com.nio.webmagic.model.JavaBokeModel;
    /**
     * WebMagic page processor that crawls the cnblogs Java category listing pages
     * and stores every post's title, link, author, author home page and summary in
     * the {@code Boke} table.
     *
     * <p>One JDBC connection is shared by all crawler threads. Access to it is
     * serialized on the class lock, and each batch of rows uses its own prepared
     * statement that is closed as soon as the batch has been written.
     */
    public class JavaBoKePageProcessor implements PageProcessor {
        /** Entry listing page; listing page N is this URL followed by N. */
        private static final String LISTING_URL = "http://www.cnblogs.com/cate/java/";
        /** First numbered listing page queued in addition to the entry page. */
        private static final int FIRST_NUMBERED_PAGE = 2;
        /** Last numbered listing page queued. */
        private static final int LAST_NUMBERED_PAGE = 200;

        /** CSS query for the post title anchor (supplies both title text and link). */
        private static final String TITLE_QUERY = "div.post_item_body h3 a.titlelnk";
        /** CSS query for the author anchor (supplies both name and home page link). */
        private static final String AUTHOR_QUERY = "div.post_item_foot a.lightblue ";
        /** CSS query for the post summary paragraph. */
        private static final String SUMMARY_QUERY = "div.post_item_body p.post_item_summary";
        /** Parameterized insert statement for one crawled post. */
        private static final String INSERT_SQL = "INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)";

        /** Numbered listing page URLs, built once and only ever read afterwards. */
        private static final List<String> LISTING_URLS = urls();

        /** Shared connection, opened lazily; guarded by the class lock. */
        private static Connection conn = null;

        /**
         * Returns the shared connection, opening it on first use.
         *
         * @return the shared connection, or {@code null} if it could not be opened
         */
        private static synchronized Connection getConnection() {
            if (conn == null) {
                conn = MySqlJdbcUtils.getOpenConnection();
            }
            return conn;
        }

        /**
         * Closes the shared connection if it is open and forgets it, so that a
         * later {@link #getConnection()} call opens a fresh one.
         */
        private static synchronized void closeConnection() {
            if (conn == null) {
                return;
            }
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            } finally {
                conn = null;
            }
        }

        /**
         * Inserts the given posts, one row each.
         *
         * <p>The method holds the class lock because every crawler thread writes
         * through the same connection. An SQL error is printed and aborts the rest
         * of the batch; it is not propagated to the crawler.
         *
         * @param javaBokes posts to store; {@code null} or empty is a no-op
         */
        private static synchronized void insertDb(List<JavaBokeModel> javaBokes) {
            if (javaBokes == null || javaBokes.isEmpty()) {
                return;
            }
            Connection connection = getConnection();
            if (connection == null) {
                System.err.println("No database connection available; skipping " + javaBokes.size() + " rows");
                return;
            }
            PreparedStatement statement = null;
            try {
                statement = connection.prepareStatement(INSERT_SQL);
                for (JavaBokeModel javaBoke : javaBokes) {
                    statement.setString(1, javaBoke.getTitle());
                    statement.setString(2, javaBoke.getLinke());
                    statement.setString(3, javaBoke.getAuthor());
                    statement.setString(4, javaBoke.getAuthorUrl());
                    statement.setString(5, javaBoke.getSummary());
                    statement.executeUpdate();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            } finally {
                // Close per batch; otherwise one statement would leak for every crawled page.
                closeStatement(statement);
            }
        }

        /**
         * Closes a statement, printing (not propagating) any close failure.
         *
         * @param statement statement to close; may be {@code null}
         */
        private static void closeStatement(PreparedStatement statement) {
            if (statement == null) {
                return;
            }
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }

        /**
         * Builds the URLs of the numbered listing pages.
         *
         * @return URLs for pages {@code FIRST_NUMBERED_PAGE} through
         *         {@code LAST_NUMBERED_PAGE}, in ascending order
         */
        private static List<String> urls() {
            List<String> listUrl = new ArrayList<String>(LAST_NUMBERED_PAGE - FIRST_NUMBERED_PAGE + 1);
            for (int i = FIRST_NUMBERED_PAGE; i <= LAST_NUMBERED_PAGE; i++) {
                listUrl.add(LISTING_URL + i);
            }
            return listUrl;
        }

        /**
         * Returns the combined text of the elements matching a CSS query.
         *
         * @param doc   parsed HTML fragment of one post
         * @param query jsoup CSS query
         * @return text of the matched elements
         */
        private static String selectText(Document doc, String query) {
            return doc.select(query).text();
        }

        /**
         * Returns the {@code href} attribute selected by a CSS query.
         *
         * @param doc   parsed HTML fragment of one post
         * @param query jsoup CSS query
         * @return the {@code href} value reported by jsoup for the match
         */
        private static String selectLink(Document doc, String query) {
            return doc.select(query).attr("href");
        }

        /**
         * Extracts every post on a listing page and stores the results.
         *
         * @param page the downloaded listing page
         * @see us.codecraft.webmagic.processor.PageProcessor#process(us.codecraft.webmagic.Page)
         */
        @Override
        public void process(Page page) {
            // The same numbered pages are offered on every processed page.
            // NOTE(review): presumably the scheduler drops already-seen URLs - confirm.
            page.addTargetRequests(LISTING_URLS);

            // One HTML fragment per post on the listing page.
            List<String> htmls = page.getHtml().xpath("//div[@class='post_item']/html()").all();
            List<JavaBokeModel> javaBokes = new ArrayList<JavaBokeModel>(htmls.size());
            for (String html : htmls) {
                // Parse each fragment once and run all queries against the same document.
                Document item = Jsoup.parse(html);
                JavaBokeModel javaBoke = new JavaBokeModel();
                javaBoke.setTitle(selectText(item, TITLE_QUERY));
                javaBoke.setLinke(selectLink(item, TITLE_QUERY));
                javaBoke.setAuthor(selectText(item, AUTHOR_QUERY));
                javaBoke.setAuthorUrl(selectLink(item, AUTHOR_QUERY));
                javaBoke.setSummary(selectText(item, SUMMARY_QUERY));
                javaBokes.add(javaBoke);
            }
            insertDb(javaBokes);
        }

        /**
         * Crawl settings: a sleep time of 1000 and up to 10 retries.
         *
         * @return the site configuration used by the spider
         */
        @Override
        public Site getSite() {
            return Site.me().setSleepTime(1000).setRetryTimes(10);
        }

        /**
         * Runs the crawl with five threads, starting from the entry listing page,
         * and prints the elapsed time in seconds.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            System.out.println("========小爬虫【启动】喽!=========");
            getConnection();
            long startTime = System.currentTimeMillis();
            try {
                Spider.create(new JavaBoKePageProcessor())
                        .addUrl(LISTING_URL)
                        .thread(5)
                        .run();
            } finally {
                // Release the shared connection even if the crawl aborts with an exception.
                closeConnection();
            }
            long endTime = System.currentTimeMillis();
            System.out.println("========小爬虫【结束】喽!=========");
            System.out.println("用时为:"+(endTime-startTime)/1000+"s");
        }

    }
    复制代码

    数据:

  • 相关阅读:
    3.4
    3.3 TensorFlow运行模型 ------- 会话
    3.2 TensorFlow数据模型 ---- 张量
    3.1 TensorFlow计算模型 --- 计算图
    寻找两个有序数组的中位数
    最长子串
    vector的遍历删除
    超时空大决战
    面经七
    面经五
  • 原文地址:https://www.cnblogs.com/aibabel/p/11017558.html
Copyright © 2011-2022 走看看