zoukankan      html  css  js  c++  java
  • springboot+webmagic实现java爬虫jdbc及mysql

    前段时间需要爬取网页上的信息,自己对于爬虫没有任何了解,就了解了一下webmagic,写了个简单的爬虫。

    一、首先介绍一下webmagic:

    webmagic采用完全模块化的设计,功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),支持多线程抓取,分布式抓取,并支持自动重试、自定义UA/cookie等功能。

    实现理念:

    Maven依赖:

    <!-- WebMagic core: scheduler, downloader and the PageProcessor API. -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-core</artifactId>
      <version>0.7.3</version>
    </dependency>

    <!-- WebMagic extension module. Declared once only: Maven warns when the same
         groupId:artifactId appears twice, and only the last declaration takes effect. -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-extension</artifactId>
      <version>0.7.3</version>
      <exclusions>
        <!-- NOTE(review): presumably excluded so this SLF4J binding does not clash with
             the logging backend already supplied by Spring Boot; confirm for your build. -->
        <exclusion>
          <groupId>org.slf4j</groupId>
          <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
      </exclusions>
    </dependency>

    JDBC模式(直接通过JDBC将抓取结果写入MySQL):

    ublic class CsdnBlogDao {
        private Connection conn = null;
        private Statement stmt = null;
    
        public CsdnBlogDao() {
            try {
                Class.forName("com.mysql.jdbc.Driver");
                String url = "jdbc:mysql://localhost:3306/test?"
                        + "user=***&password=***3&useUnicode=true&characterEncoding=UTF8";
                conn = DriverManager.getConnection(url);
                stmt = conn.createStatement();
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
    
        }
    
        public int add(CsdnBlog csdnBlog) {
            try {
                String sql = "INSERT INTO `test`.`csdnblog` (`keyes`, `titles`, `content` , `dates`, `tags`, `category`, `views`, `comments`, `copyright`) VALUES (?, ?, ?, ?, ?, ?, ?, ?,?);";
                PreparedStatement ps = conn.prepareStatement(sql);
                ps.setInt(1, csdnBlog.getKey());
                ps.setString(2, csdnBlog.getTitle());
                ps.setString(3,csdnBlog.getContent());
                ps.setString(4, csdnBlog.getDates());
                ps.setString(5, csdnBlog.getTags());
                ps.setString(6, csdnBlog.getCategory());
                ps.setInt(7, csdnBlog.getView());
                ps.setInt(8, csdnBlog.getComments());
                ps.setInt(9, csdnBlog.getCopyright());
                return ps.executeUpdate();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            return -1;
        }
    }

    实体类:

    /**
     * Plain mutable bean describing one crawled CSDN blog article.
     *
     * <p>All fields start at their Java defaults ({@code 0} / {@code null}) and are
     * populated through the setters.
     */
    public class CsdnBlog {
        /** Article number. */
        private int key;
        /** Article title. */
        private String title;
        /** Text content of the article. */
        private String content;
        /** Publication date, kept as text. */
        private String dates;
        /** Tags. */
        private String tags;
        /** Categories. */
        private String category;
        /** Number of readers. */
        private int view;
        /** Number of comments. */
        private int comments;
        /** Original-work flag. */
        private int copyright;

        public int getKey() {
            return this.key;
        }

        public void setKey(int key) {
            this.key = key;
        }

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getContent() {
            return this.content;
        }

        public void setContent(String content) {
            this.content = content;
        }

        public String getDates() {
            return this.dates;
        }

        public void setDates(String dates) {
            this.dates = dates;
        }

        public String getTags() {
            return this.tags;
        }

        public void setTags(String tags) {
            this.tags = tags;
        }

        public String getCategory() {
            return this.category;
        }

        public void setCategory(String category) {
            this.category = category;
        }

        public int getView() {
            return this.view;
        }

        public void setView(int view) {
            this.view = view;
        }

        public int getComments() {
            return this.comments;
        }

        public void setComments(int comments) {
            this.comments = comments;
        }

        public int getCopyright() {
            return this.copyright;
        }

        public void setCopyright(int copyright) {
            this.copyright = copyright;
        }

        /** Renders every field as {@code name=value} pairs inside {@code CsdnBlog [...]}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("CsdnBlog [");
            text.append("key=").append(key);
            text.append(", title=").append(title);
            text.append(", content=").append(content);
            text.append(",dates=").append(dates);
            text.append(", tags=").append(tags);
            text.append(", category=").append(category);
            text.append(", view=").append(view);
            text.append(", comments=").append(comments);
            text.append(", copyright=").append(copyright);
            return text.append("]").toString();
        }
    }

    启动类:

    public class CsdnBlogPageProcessor implements PageProcessor {
    
    
        private static String username="CHENYUFENG1991";  // 设置csdn用户名
    
        private static int size = 0;// 共抓取到的文章数量
    
        // 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等
        private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
    
        public Site getSite() {
            return site;
        }
    
    
        // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
        public void process(Page page) {
            // 列表页
            if (!page.getUrl().regex("http://blog\.csdn\.net/" + username + "/article/details/\d+").match()) {
                // 添加所有文章页
                page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links()// 限定文章列表获取区域
                        .regex("/" + username + "/article/details/\d+")
                        .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/")// 巧用替换给把相对url转换成绝对url
                        .all());
                // 添加其他列表页
                page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links()// 限定其他列表页获取区域
                        .regex("/" + username + "/article/list/\d+")
                        .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/")// 巧用替换给把相对url转换成绝对url
                        .all());
                // 文章页
            } else {
                size++;// 文章数量加1
                // 用CsdnBlog类来存抓取到的数据,方便存入数据库
                CsdnBlog csdnBlog = new CsdnBlog();
                // 设置编号
                csdnBlog.setKey(Integer.parseInt(
                        page.getUrl().regex("http://blog\.csdn\.net/" + username + "/article/details/(\d+)").get()));
                // 设置标题
                csdnBlog.setTitle(
                        page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
    
                //设置内容
                csdnBlog.setContent(
                        page.getHtml().xpath("//div[@class='article_content']/allText()").get());
    
                // 设置日期
                csdnBlog.setDates(
                        page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
                // 设置标签(可以有多个,用,来分割)
                csdnBlog.setTags(listToString(page.getHtml().xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
                // 设置类别(可以有多个,用,来分割)
                csdnBlog.setCategory(listToString(page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));
                // 设置阅读人数
                csdnBlog.setView(Integer.parseInt(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']")
                        .regex("(\d+)人阅读").get()));
                // 设置评论人数
                csdnBlog.setComments(Integer.parseInt(page.getHtml()
                        .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\((\d+)\)").get()));
                // 设置是否原创
                csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);
                // 把对象存入数据库
                new CsdnBlogDao().add(csdnBlog);
                // 把对象输出控制台
                System.out.println(csdnBlog);
            }
        }
    
        // 把list转换为string,用,分割
        public static String listToString(List<String> stringList) {
            if (stringList == null) {
                return null;
            }
            StringBuilder result = new StringBuilder();
            boolean flag = false;
            for (String string : stringList) {
                if (flag) {
                    result.append(",");
                } else {
                    flag = true;
                }
                result.append(string);
            }
            return result.toString();
        }
    
        public static void main(String[] args) {
            long startTime, endTime;
            System.out.println("【爬虫开始】...");
            startTime = System.currentTimeMillis();
            // 从用户博客首页开始抓,开启5个线程,启动爬虫
            Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();
            endTime = System.currentTimeMillis();
            System.out.println("【爬虫结束】共抓取" + size + "篇文章,耗时约" + ((endTime - startTime) / 1000) + "秒,已保存到数据库,请查收!");
        }
    }

    Spring Boot + MyBatis模式(通过Service/Mapper写入MySQL):

    public class GamePageProcessor implements PageProcessor {
    
        private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);
        private static DianJingService d;
        private static BannerService bs;
        private static SportService ss;
        private static YuLeNewsService ys;
    
        private static UpdateService ud ;
        // 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等
        private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
    
        public Site getSite() {
            return site;
        }
        // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
        public static void main(String[] args) {
            ConfigurableApplicationContext context= SpringApplication.run(GamePageProcessor.class, args);
             d = context.getBean(DianJingService.class);
            //Spider.create(new GamePageProcessor()).addUrl("网址").thread(5).run();
        }
    
        public void process(Page page) {
            Selectable url = page.getUrl();
            if (url.toString().equals("网址")) {
                DianJingVideo dv = new DianJingVideo();
                List<String> ls = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();
                //hrefs
                List<String> ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all();//获取a标签的href
    
                List<String> ls2 = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();
                //photo
                List<String> ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();
    
                for (int i = 0; i < 5; i++) {
                    dv.setTitles(ls.get(i));
                    dv.setCategory("");
                    dv.setDates(ls2.get(i));
                    dv.setHrefs(ls1.get(i));
                    dv.setPhoto(ls3.get(i));
                    dv.setSources("");
    
                    d.addVideo(dv);
    
                }
            }
    }

    Controller:

    /**
     * Web endpoint under {@code /dianjing} that returns the records supplied by
     * {@link DianJingService#find2()} as a JSON object.
     */
    @Controller
    @RequestMapping(value = "/dianjing")
    public class DianJingController {

        @Autowired
        private DianJingService s;

        /**
         * Mobile games listing.
         *
         * @return a JSON object with {@code code}, {@code success}, {@code count} and
         *         {@code list}; an empty JSON object when the service returns {@code null}
         */
        @RequestMapping("/dianjing")
        @ResponseBody
        public Object dianjing() {
            List<DianJing> records = s.find2();
            JSONObject response = new JSONObject();
            if (records == null) {
                // Nothing to report: the body stays an empty object.
                return response;
            }
            response.put("code", 0);
            response.put("success", true);
            response.put("count", records.size());
            response.put("list", records);
            return response;
        }
    }

    实体类就不展示了

    dao层

        /**
         * Inserts one row into the {@code dianjing} table, filling the columns
         * {@code titles}, {@code dates}, {@code category}, {@code hrefs}, {@code photo}
         * and {@code sources} from the same-named properties of the argument.
         *
         * @param dj the record whose properties are bound to the statement placeholders
         * @return presumably the number of inserted rows (the usual mapper convention);
         *         TODO confirm against the persistence framework in use
         */
        @Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
        int adddj(DianJing dj);
  • 相关阅读:
    一百一十五:CMS系统之实现点击更换图形验证码功能
    python用cx_Oracle连接oracle
    一百一十四:CMS系统之图形验证码生成
    一百一十三:CMS系统之前台注册界面
    一百一十二:CMS系统之前台用户模型
    一百一十一:CMS系统之后端权限验证功能
    前端开发之JavaScript HTML DOM理论篇二
    前端开发之JavaScript HTML DOM理论篇一
    前端开发之JavaScript基础篇四
    前端开发之JavaScript基础篇三
  • 原文地址:https://www.cnblogs.com/NCL--/p/8608336.html
Copyright © 2011-2022 走看看