  • Crawling Douban group posts with HttpClient and Jsoup in Java and storing them in MySQL

    First, pick the group to crawl; this post uses Douban's "five" group as the example. Since this is my first Java crawler, I went with a simple brute-force loop over every page; I may improve it later when I have time (or possibly switch to Python instead).

    Also, this project is built with Spring Boot.

    1. Crawl the links to all posts

    1.1 Analyze the page

    Start from the group's home page (screenshot omitted).

    Click "more discussions", then change the start parameter in the address bar to 0.

    Paging forward from there to the last page covers every post in the group. All we need to do is extract the post links on each page, bump the start parameter in the URL, and iterate to the end to collect the links to all of the posts.
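    As a small sketch of the paging scheme (the group id 692739 is taken from the test class later; the page size of roughly 30 posts is what the listing pages show), the listing URLs differ only in the start offset:

    public class ListingUrls {
        public static void main(String[] args) {
            // Each listing page shows about 30 posts, so start advances by the page size.
            String base = "https://www.douban.com/group/692739/discussion?start=";
            for (int start = 0; start <= 60; start += 30) {
                System.out.println(base + start); // ?start=0, ?start=30, ?start=60
            }
        }
    }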

    1.2 Create the database

    CREATE DATABASE `douban`;
    
    USE `douban`;
    
    
    # Stores each post's title and link
    DROP TABLE IF EXISTS `985posts`;
    
    CREATE TABLE `985posts` (
    	id INT NOT NULL AUTO_INCREMENT COMMENT 'post id',
    	title VARCHAR(150) NOT NULL COMMENT 'post title',
    	author VARCHAR(50) NOT NULL COMMENT 'post author',
    	post_href VARCHAR(100) NOT NULL COMMENT 'URL of the post',
    	PRIMARY KEY (id)
    ) ENGINE=INNODB DEFAULT CHARSET=utf8mb4;
    

    utf8mb4 is used because posts contain emoji; each emoji takes 4 bytes in UTF-8, which MySQL's legacy 3-byte utf8 charset cannot store.
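    A quick standalone check of the byte count (not part of the crawler):

    import java.nio.charset.StandardCharsets;
    
    public class EmojiByteCheck {
        public static void main(String[] args) {
            // U+1F600 (a grinning-face emoji) lies outside the BMP, so UTF-8 needs 4 bytes for it;
            // MySQL's legacy 3-byte utf8 charset cannot store it, hence utf8mb4.
            String emoji = "\uD83D\uDE00";
            System.out.println(emoji.getBytes(StandardCharsets.UTF_8).length); // prints 4
        }
    }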

    1.3 Code

    1.3.1 Add the dependencies

    <!--HttpClient-->
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.12</version>
    </dependency>
    
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>
    

    These are the two main dependencies, httpclient and jsoup; add others as needed, for example mybatis and lombok.

    1.3.2 Database access with MyBatis

    The entity class:

    package com.fan.pojo;
    
    import lombok.AllArgsConstructor;
    import lombok.Data;
    import lombok.NoArgsConstructor;
    
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public class DoubanPost {
        private int id;
        private String title;
        private String author;
        private String postHref;
    }
    

    The mapper interface:

    package com.fan.mapper;
    
    import com.fan.pojo.DoubanPost;
    import org.apache.ibatis.annotations.Mapper;
    import org.springframework.stereotype.Repository;
    
    import java.util.List;
    
    /**
     * CRUD operations for the 985posts table
     * @author 
     * @date 2020/8/23 - 2:18
     */
    
    // @Mapper marks this interface as a MyBatis mapper
    @Mapper
    @Repository
    public interface DoubanPostMapper {
    
        List<DoubanPost> queryDoubanPostList();
    
        DoubanPost queryDoubanPostById(int id);
    
        int addDoubanPost(DoubanPost doubanPost);
    
        int deleteDoubanPost(int id);
    
    }
    

    The mapper XML:

    <?xml version="1.0" encoding="UTF-8" ?>
    <!DOCTYPE mapper
            PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
            "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
    <mapper namespace="com.fan.mapper.DoubanPostMapper">
    
        <select id="queryDoubanPostList" resultType="DoubanPost">
            select * from douban.985posts;
        </select>
    
        <select id="queryDoubanPostById" resultType="DoubanPost">
            select * from douban.985posts where id=#{id};
        </select>
    
        <insert id="addDoubanPost" parameterType="DoubanPost">
            insert into douban.985posts (title, author, post_href) VALUES (#{title}, #{author}, #{postHref});
        </insert>
    
        <delete id="deleteDoubanPost" parameterType="int">
            delete from douban.985posts where id=#{id};
        </delete>
    
    </mapper>
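    One thing the post does not show is the Spring Boot configuration that lets resultType="DoubanPost" resolve and tells MyBatis where the mapper XML lives. The following application.properties is a guessed minimal setup (it assumes the mybatis-spring-boot-starter dependency, the mapper XML under src/main/resources/mapper/, and placeholder datasource credentials):

    # assumed application.properties -- not shown in the original post
    spring.datasource.url=jdbc:mysql://localhost:3306/douban?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC
    spring.datasource.username=root
    spring.datasource.password=your_password
    spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
    
    # lets resultType="DoubanPost" resolve to com.fan.pojo.DoubanPost
    mybatis.type-aliases-package=com.fan.pojo
    # where the mapper XML files live
    mybatis.mapper-locations=classpath:mapper/*.xml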
    

    1.3.3 Utility class for fetching a page's HTML

    package com.fan.util;
    
    import org.apache.http.HttpHost;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.utils.URIBuilder;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    import java.net.URISyntaxException;
    import java.util.Map;
    
    /**
     * @author 
     * @date 2020/8/23 - 1:28
     */
    public class HttpUtils {
    
        // Connection pool manager (note: the doGetHtml method below never actually uses it)
        private static PoolingHttpClientConnectionManager cm;
    
        // An optional proxy, for example:
        // http://120.79.209.11:3128
        // 61.160.245.88 46779
        // 171.35.215.90 9999
        // private static HttpHost proxy = new HttpHost("61.178.118.86", 8080);
    
    
        public HttpUtils() {
            cm = new PoolingHttpClientConnectionManager();
            // maximum total number of connections
            cm.setMaxTotal(100);
            // maximum number of connections per route
            cm.setDefaultMaxPerRoute(10);
        }
    
        // Build the request configuration
        private static RequestConfig getConfig() {
            RequestConfig config = RequestConfig.custom()
                    .setConnectTimeout(100 * 1000) // max time to establish the connection, in milliseconds
                    .setConnectionRequestTimeout(100 * 1000) // max time to obtain a connection from the pool, in milliseconds
                    .setSocketTimeout(100 * 1000) // max time between data packets, in milliseconds
                    .build();
            return config;
        }
    
    
        /**
         * Download the page at the given URL
         * @param url request URL
         * @param map query parameters
         * @param mapTitle request headers
         * @return the page HTML, or "" on failure
         * @throws URISyntaxException
         */
        public static String doGetHtml(String url, Map<String, String> map, Map<String, String> mapTitle) throws URISyntaxException {
            // Create the HttpClient (note: createDefault() does not use the pool configured in the constructor)
            CloseableHttpClient httpClient = HttpClients.createDefault();
    
            // Build the request URI
            URIBuilder uriBuilder = new URIBuilder(url);
    
            // Add the query parameters
            if (!map.isEmpty()) {
                for (String key : map.keySet()) {
                    uriBuilder.setParameter(key, map.get(key));
                }
            }
    
            // Create the HttpGet with the final request URI from uriBuilder.build()
            HttpGet httpGet = new HttpGet(uriBuilder.build());
    
            // Add the request headers
            if (!mapTitle.isEmpty()) {
                for (String key : mapTitle.keySet()) {
                    httpGet.addHeader(key, mapTitle.get(key));
                }
            }
    
            // Apply the request configuration
            httpGet.setConfig(getConfig());
            System.out.println("Note the request below:");
            System.out.println("Outgoing request: " + httpGet);
    
            // Execute the request with HttpClient and get the response
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                // If the response entity is not null, read it with EntityUtils
                if (response.getEntity() != null) {
                    String content = EntityUtils.toString(response.getEntity(), "utf8");
                    return content;
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Close the response; guard against a null response when execute() throws
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
    
            System.out.println("Fetch failed");
            return "";
        }
    
    }
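    As the comments above admit, the pooled connection manager is configured but never used: doGetHtml() builds a fresh default client on every call. A minimal sketch of how the pool could actually be wired in, assuming the same limits (the class name PooledHttpClientFactory is mine, not from the original):

    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    
    public final class PooledHttpClientFactory {
    
        private static final PoolingHttpClientConnectionManager CM = new PoolingHttpClientConnectionManager();
    
        static {
            CM.setMaxTotal(100);          // total connections across all routes
            CM.setDefaultMaxPerRoute(10); // connections per host
        }
    
        // Clients created here share the pool instead of opening fresh connections each time
        public static CloseableHttpClient newClient() {
            return HttpClients.custom()
                    .setConnectionManager(CM)
                    .setConnectionManagerShared(true) // keep the pool alive when a client is closed
                    .build();
        }
    }

    doGetHtml() could then call PooledHttpClientFactory.newClient() instead of HttpClients.createDefault().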
    

    1.3.4 Write the test class

    package com.fan;
    
    import com.fan.mapper.DoubanPostMapper;
    import com.fan.pojo.DoubanPost;
    import com.fan.util.HttpUtils;
    import org.json.JSONObject;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Attributes;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.junit.jupiter.api.Test;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.boot.test.context.SpringBootTest;
    
    import java.net.URISyntaxException;
    import java.util.HashMap;
    import java.util.Map;
    
    @SpringBootTest
    class CrawdoubangroupsApplicationTests {
    
        @Autowired
        private DoubanPostMapper doubanPostMapper;
    
        @Test
        void contextLoads() throws URISyntaxException {
            // The listing URL
            // https://www.douban.com/group/692739/discussion?start=0
            String url = "https://www.douban.com/group/692739/discussion";
            Map<String, String> map = new HashMap<>();
            Map<String, String> mapTitle = new HashMap<>();
            // Query parameters (start is set inside the loop below)
            // map.put("start", "0");
            // Request headers
            mapTitle.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36 Edg/84.0.522.61");
            mapTitle.put("Cookie", "ll='118171'; bid=s9mINaPcmtA; __utmz=30149280.1592482199.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=30149280.21555; douban-fav-remind=1; push_doumail_num=0; push_noty_num=0; __utmc=30149280; _pk_ses.100001.8cb4=*; __utma=30149280.1877381396.1592482199.1598182478.1598185708.18; ck=PnCH; __utmt=1; ap_v=0,6.0; _pk_id.100001.8cb4=9fc580371d86dd7a.1592482197.19.1598187695.1598182755.; __utmb=30149280.5.10.1598185708");
    
    
            DoubanPost doubanPost = new DoubanPost();
    
            // Walk through every listing page (about 14,000 posts in total) and store the links in the database
            for (int k = 1; k < 14062;) {
                // Update the start parameter; each listing page shows at most 30 posts,
                // so k is advanced by the number of rows actually parsed at the end of the loop
                map.put("start", k + "");
                // Fetch the page html with the utility class
                String html01 = HttpUtils.doGetHtml(url, map, mapTitle);
                System.out.println("html snapshot ============================================================================================> debug");
                System.out.println(html01);
                System.out.println("html snapshot ============================================================================================> debug");
                // Parse the html with Jsoup
                Document document = Jsoup.parse(html01);
                // Grab the #content element
                Element content = document.getElementById("content");
                // All tr elements under #content (one row per post)
                Elements trs = content.getElementsByTag("tr");
                // Iterate over the rows, skipping the first one (the table header)
                for (int i = 1; i < trs.size(); i++) {
                    // A single tr, fetched by index
                    Element element = trs.get(i);
                    // The td children of this tr
                    Elements children = element.children();
                    // System.out.println(children);
                    // First column: extract the title and the link
                    Element child = children.get(0).child(0);
                    // System.out.println(child);
                    Attributes attributes = child.attributes();
                    // System.out.println("Post link:");
                    String href = attributes.get("href");
                    // System.out.println(href);
                    // Fill the entity
                    doubanPost.setPostHref(href);
                    // System.out.println("Title:");
                    String title = attributes.get("title");
                    // System.out.println(title);
                    // Fill the entity
                    doubanPost.setTitle(title);
    
                    // If the first child carries no title attribute, fall back to the second child of the td
                    if (title.equals("")) {
                        System.out.println("Fallback branch triggered ====>");
    
                        child = children.get(0).child(1);
                        // System.out.println(child);
                        attributes = child.attributes();
                        // System.out.println("Post link:");
                        href = attributes.get("href");
                        // System.out.println(href);
                        // Fill the entity
                        doubanPost.setPostHref(href);
                        // System.out.println("Title:");
                        title = attributes.get("title");
                        // System.out.println(title);
                        // Fill the entity
                        doubanPost.setTitle(title);
                    }
    
    
                    // Second column: extract the author's profile link
                    Element child1 = children.get(1).child(0);
                    Attributes attributes1 = child1.attributes();
                    String href1 = attributes1.get("href");
                    doubanPost.setAuthor(href1);
    
                    // Print the entity and save it to the database
                    System.out.println(doubanPost);
                    doubanPostMapper.addDoubanPost(doubanPost);
    
                    System.out.println(i+1 + "===============");
                }
    
                System.out.println("Page boundary ====> start = " + k);
                // Advance start by the number of posts on this page (the header row is excluded)
                k += trs.size() - 1;
    
            }
    
        }
    
    }
    

    So, yes: pure brute force. In the end it collected roughly 14,000 records.

    The results in the database (screenshot omitted).

    2. Use the links from step 1 to fetch the HTML page of each individual post

    The database code here is very similar to step 1, so it is omitted; only the final test class is shown, along with a sketch of the Post entity it relies on (below).
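    The Post entity and PostMapper used below are part of that omitted code. As a reference, this is a sketch of what Post presumably looks like, reconstructed from the setters the test class calls (field names beyond those setters are assumptions):

    package com.fan.pojo;
    
    import lombok.AllArgsConstructor;
    import lombok.Data;
    import lombok.NoArgsConstructor;
    
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public class Post {
        private int id;            // surrogate key in the posts table (assumed)
        private int postId;        // Douban's numeric topic id, parsed from the post URL
        private String title;      // post title
        private String author;     // author's display name
        private String authorHref; // author's profile URL
        private String postHref;   // URL of the post
        private String content;    // body text of the post
    }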

    package com.fan;
    
    import com.fan.mapper.DoubanPostMapper;
    import com.fan.mapper.PostMapper;
    import com.fan.pojo.DoubanPost;
    import com.fan.pojo.Post;
    import com.fan.util.HttpUtils;
    import org.apache.http.protocol.HTTP;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Attributes;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.junit.jupiter.api.Test;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.boot.test.context.SpringBootTest;
    
    import java.net.URISyntaxException;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    /**
     * Crawls the body text of each main Douban post
     * @author 
     * @date 2020/8/24 - 14:08
     */
    
    @SpringBootTest
    public class CrawDoubanMainPosts {
    
        @Autowired
        private DoubanPostMapper doubanPostMapper;
    
        @Autowired
        private PostMapper postMapper;
    
    
        // Fetch each post's html in bulk and store the main post's content in the database
        @Test
        public void test01() throws URISyntaxException {
            // Request URL: each post has its own url (looked up from the database inside the loop)
            // String url = null;
            Map<String, String> map = new HashMap<>();
            Map<String, String> mapTitle = new HashMap<>();
            // Request headers
            mapTitle.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36 Edg/84.0.522.61");
            mapTitle.put("Cookie", "ll='118171'; bid=s9mINaPcmtA; __utmv=30149280.21555; douban-fav-remind=1; push_doumail_num=0; push_noty_num=0; ct=y; douban-profile-remind=1; ck=PnCH; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1598334460%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fwww.douban.com%252Fgroup%252Ftopic%252F175432568%252F%22%5D; _pk_id.100001.8cb4=9fc580371d86dd7a.1592482197.27.1598334460.1598257938.; _pk_ses.100001.8cb4=*; __utma=30149280.1877381396.1592482199.1598255406.1598334460.26; __utmc=30149280; __utmz=30149280.1598334460.26.3.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utmt=1; __utmb=30149280.4.8.1598334460");
    
            DoubanPost doubanPost = null; // holds the link row fetched from the database on each iteration
            Post post = new Post();
    
            for (int i = 11; i < 14068; i++) {
                doubanPost = doubanPostMapper.queryDoubanPostById(i);
                // Fill in the title
                post.setTitle(doubanPost.getTitle());
                // Fill in authorHref (the first crawl stored the author's profile link in the author column)
                post.setAuthorHref(doubanPost.getAuthor());
                // Fill in postHref
                post.setPostHref(doubanPost.getPostHref());
    
                // Extract the numeric post id
                String postHref = doubanPost.getPostHref(); // the post page is fetched from this url below
                // Assumes urls of the form https://www.douban.com/group/topic/<id>/ (the prefix is 35 characters long)
                String s = postHref.substring(35, postHref.length() - 1);
                int postId = Integer.parseInt(s);
                // Fill in postId
                post.setPostId(postId);
    
                // Only author and content remain to be filled in
                String html = HttpUtils.doGetHtml(postHref, map, mapTitle);
                Document document = Jsoup.parse(html);
                Element content = document.getElementById("content");
                // Guard against a NullPointerException when the post no longer exists
                if (content == null) {
                    System.out.println("Post no longer exists");
                    post.setContent("Post no longer exists");
                    // Fill in the author's display name
                    String username = "author has deleted this post";
                    post.setAuthor(username);
                    // Save to the database
                    postMapper.addPost(post);
                    System.out.println("Inserted record " + i + " successfully!");
                    continue;
                }
                Element elementById = content.getElementById("link-report");
                Elements p = elementById.getElementsByTag("p");
                String article = "";
                for (Element element : p) {
                    String html02 = element.html();
                    html02 += "\n";
                    article += html02;
                }
                // Fill in the article content
                post.setContent(article);
                // Fill in the author's display name (fragile: this long child() chain depends on the exact page layout)
                String username = content.child(0).child(0).child(2).child(1).child(0).child(0).child(0).html();
                post.setAuthor(username);
    
                // Save to the database
                postMapper.addPost(post);
                System.out.println("Inserted record " + i + " successfully!");
                // Images and links are discarded; only the text part is kept for now
            }
            System.out.println("Finished fetching the posts");
        }
    
    }
    

    The results in the database (screenshot omitted).

    This only crawls the body of each main post, not the comments. By the time it finished, Douban had already restricted my IP (it let me crawl this much before blocking me, so Douban is quite friendly to beginners; of course Douban has plenty of experts, and serious anti-crawling measures would not be hard for them). Java also feels a bit slow for this. I have not used multiple threads yet, and even with threads the code would still be verbose, probably several times as much as the equivalent Python, so I plan to use Python the next time I crawl group posts and keep Java for building web applications.

  • Original post: https://www.cnblogs.com/fanlumaster/p/13574848.html