First, pick the group to crawl; this walkthrough uses Douban's "five" group as the example. Since this is my first Java crawler, I went with a simple brute-force loop and will improve it later when I have time (though I may have switched to Python by then).
Also, this project is built with Spring Boot.
1. Crawl the links of all posts
1.1 Analyze the page
From the group homepage, click "More discussions" and change the start parameter in the address bar to 0. Paging forward to the end covers every post in the group: each page lists a batch of posts, so we extract the post links from each page, bump the start parameter (start=0, start=30, start=60, ...), and repeat until the last page to collect the links of all posts.
1.2 Create the database
CREATE DATABASE `douban`;
USE `douban`;
# stores the title and link of every post
DROP TABLE IF EXISTS `985posts`;
CREATE TABLE `985posts` (
    id INT NOT NULL AUTO_INCREMENT COMMENT 'post id',
    title VARCHAR(150) NOT NULL COMMENT 'post title',
    author VARCHAR(50) NOT NULL COMMENT 'post author',
    post_href VARCHAR(100) NOT NULL COMMENT 'URL of the post',
    KEY id(id)
) ENGINE=INNODB DEFAULT CHARSET=utf8mb4;
The charset is utf8mb4 because posts contain emoji, and each emoji takes 4 bytes; MySQL's legacy utf8 charset stores at most 3 bytes per character, so it cannot hold them.
1.3 Coding
1.3.1 Add the required dependencies
<!--HttpClient-->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>
These two, httpclient and jsoup, are the main dependencies; pull in others as needed, such as mybatis, lombok, and so on.
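For reference, the extra dependencies might look like this (a sketch; the versions are illustrative, so pick ones compatible with your Spring Boot version):

<!-- MyBatis integration for Spring Boot -->
<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>2.1.3</version>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <scope>runtime</scope>
</dependency>
<!-- Lombok, for @Data and the constructor annotations used below -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <scope>provided</scope>
</dependency>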
1.3.2 Use MyBatis to access the database
The entity class
package com.fan.pojo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@AllArgsConstructor
@NoArgsConstructor
public class DoubanPost {
    private int id;
    private String title;
    private String author;
    private String postHref;
}
The mapper interface
package com.fan.mapper;
import com.fan.pojo.DoubanPost;
import org.apache.ibatis.annotations.Mapper;
import org.springframework.stereotype.Repository;
import java.util.List;
/**
 * CRUD operations for the 985posts table.
 * @author
 * @date 2020/8/23 - 2:18
 */
// this annotation marks the interface as a MyBatis mapper
@Mapper
@Repository
public interface DoubanPostMapper {
    List<DoubanPost> queryDoubanPostList();   // fetch every saved post
    DoubanPost queryDoubanPostById(int id);   // fetch a single post by id
    int addDoubanPost(DoubanPost doubanPost); // insert a post, returns affected rows
    int deleteDoubanPost(int id);             // delete a post by id
}
The mapper XML
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.fan.mapper.DoubanPostMapper">
    <select id="queryDoubanPostList" resultType="DoubanPost">
        select * from douban.985posts;
    </select>
    <select id="queryDoubanPostById" resultType="DoubanPost">
        select * from douban.985posts where id=#{id};
    </select>
    <insert id="addDoubanPost" parameterType="DoubanPost">
        insert into douban.985posts (title, author, post_href) VALUES (#{title}, #{author}, #{postHref});
    </insert>
    <delete id="deleteDoubanPost" parameterType="int">
        delete from douban.985posts where id=#{id};
    </delete>
</mapper>
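One thing glossed over here: the short resultType="DoubanPost" alias, the location of this XML file, and the datasource all have to be registered with Spring Boot. A minimal application.properties sketch, assuming the XML sits under src/main/resources/mybatis/mapper/ and placeholder credentials (adjust both to your setup):

# datasource (placeholder credentials; characterEncoding matters for the utf8mb4 table)
spring.datasource.url=jdbc:mysql://localhost:3306/douban?useUnicode=true&characterEncoding=utf8&useSSL=false&serverTimezone=UTC
spring.datasource.username=root
spring.datasource.password=your_password
# lets resultType="DoubanPost" resolve to com.fan.pojo.DoubanPost
mybatis.type-aliases-package=com.fan.pojo
# where the mapper XML files live (assumed path)
mybatis.mapper-locations=classpath:mybatis/mapper/*.xml
# maps column post_href onto field postHref in "select *" queries
mybatis.configuration.map-underscore-to-camel-case=true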
1.3.3 A utility class for fetching page HTML
package com.fan.util;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Map;
/**
* @author
* @date 2020/8/23 - 1:28
*/
public class HttpUtils {
    // connection pool manager; note that doGetHtml() below builds its client
    // with HttpClients.createDefault(), so this pool is never actually used
    // (see the sketch after this class)
    private static PoolingHttpClientConnectionManager cm;
    // a proxy could be plugged in here, e.g.:
    // http://120.79.209.11:3128
    // 61.160.245.88 46779
    // 171.35.215.90 9999
    // private static HttpHost proxy = new HttpHost("61.178.118.86", 8080);
    public HttpUtils() {
        cm = new PoolingHttpClientConnectionManager();
        // maximum total connections in the pool
        cm.setMaxTotal(100);
        // maximum connections per route (i.e., per target host)
        cm.setDefaultMaxPerRoute(10);
    }
    // request configuration
    private static RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(100 * 1000)           // max time to establish the connection, in ms
                .setConnectionRequestTimeout(100 * 1000) // max time to obtain a connection, in ms
                .setSocketTimeout(100 * 1000)            // max time for data transfer, in ms
                .build();
    }
    /**
     * Download the page at the given address.
     * @param url      request URL
     * @param map      query parameters
     * @param mapTitle request headers (despite the name, this map holds headers)
     * @return the page HTML, or "" on failure
     * @throws URISyntaxException if the URL is malformed
     */
    public static String doGetHtml(String url, Map<String, String> map, Map<String, String> mapTitle) throws URISyntaxException {
        // create the HttpClient (note: this ignores the pool configured above)
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // build the request URI, attaching the query parameters
        URIBuilder uriBuilder = new URIBuilder(url);
        if (!map.isEmpty()) {
            for (String key : map.keySet()) {
                uriBuilder.setParameter(key, map.get(key));
            }
        }
        // create the HttpGet from the finished URI
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        // attach the request headers
        if (!mapTitle.isEmpty()) {
            for (String key : mapTitle.keySet()) {
                httpGet.addHeader(key, mapTitle.get(key));
            }
        }
        // attach the timeout configuration
        httpGet.setConfig(getConfig());
        System.out.println("Sending request: " + httpGet);
        // execute the request and read the response
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            // if the response entity is non-null, read it with EntityUtils
            if (response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // close the response; the null check guards against execute() having thrown
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println("Fetch failed");
        return "";
    }
}
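As noted in the comments, the pool is dead code: doGetHtml() builds a fresh default client on every call. A minimal sketch of how the pooling manager (and the commented-out proxy) could actually be wired in; this is an assumption about what was intended, not code from the original:

// build the pool statically so it exists before the first request
private static final PoolingHttpClientConnectionManager POOL = new PoolingHttpClientConnectionManager();
static {
    POOL.setMaxTotal(100);          // max connections overall
    POOL.setDefaultMaxPerRoute(10); // max connections per host
}
// one shared client backed by the pool, reused across all requests
private static final CloseableHttpClient CLIENT = HttpClients.custom()
        .setConnectionManager(POOL)
        // .setProxy(new HttpHost("61.178.118.86", 8080)) // optional proxy
        .build();
// doGetHtml() would then call CLIENT.execute(httpGet) instead of creating a
// client each time; closing the response returns its connection to the pool.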
1.3.4 Write the test class
package com.fan;
import com.fan.mapper.DoubanPostMapper;
import com.fan.pojo.DoubanPost;
import com.fan.util.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
@SpringBootTest
class CrawdoubangroupsApplicationTests {
    @Autowired
    private DoubanPostMapper doubanPostMapper;
    @Test
    void contextLoads() throws URISyntaxException {
        // base request URL
        // e.g. https://www.douban.com/group/692739/discussion?start=0
        String url = "https://www.douban.com/group/692739/discussion";
        Map<String, String> map = new HashMap<>();
        Map<String, String> mapTitle = new HashMap<>();
        // the "start" query parameter is set inside the loop below
        // map.put("start", "0");
        // request headers: a desktop User-Agent plus a logged-in Cookie
mapTitle.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36 Edg/84.0.522.61");
mapTitle.put("Cookie", "ll='118171'; bid=s9mINaPcmtA; __utmz=30149280.1592482199.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=30149280.21555; douban-fav-remind=1; push_doumail_num=0; push_noty_num=0; __utmc=30149280; _pk_ses.100001.8cb4=*; __utma=30149280.1877381396.1592482199.1598182478.1598185708.18; ck=PnCH; __utmt=1; ap_v=0,6.0; _pk_id.100001.8cb4=9fc580371d86dd7a.1592482197.19.1598187695.1598182755.; __utmb=30149280.5.10.1598185708");
        DoubanPost doubanPost = new DoubanPost();
        // walk through every page of the group and save each post into the database
        for (int k = 1; k < 14062;) {
            // each group page lists 30 posts; "start" advances by the page size at the bottom of the loop
            map.put("start", k + "");
            // fetch the page HTML with the utility class
            String html01 = HttpUtils.doGetHtml(url, map, mapTitle);
            System.out.println("html guard ============================================================================================> debug");
            System.out.println(html01);
            System.out.println("html guard ============================================================================================> debug");
            // parse the HTML with Jsoup
            Document document = Jsoup.parse(html01);
            // grab the #content element
            Element content = document.getElementById("content");
            // every post row on the page is a <tr>
            Elements trs = content.getElementsByTag("tr");
            // iterate over the rows, skipping the table header at index 0
            for (int i = 1; i < trs.size(); i++) {
                // the i-th <tr>
                Element element = trs.get(i);
                // its <td> children
                Elements children = element.children();
                // first cell: the title anchor with its link
                Element child = children.get(0).child(0);
                Attributes attributes = child.attributes();
                String href = attributes.get("href");
                doubanPost.setPostHref(href);
                String title = attributes.get("title");
                doubanPost.setTitle(title);
                // fallback: in some rows the title anchor is the second child
                if (title.equals("")) {
                    System.out.println("fallback branch taken ====>");
                    child = children.get(0).child(1);
                    attributes = child.attributes();
                    href = attributes.get("href");
                    doubanPost.setPostHref(href);
                    title = attributes.get("title");
                    doubanPost.setTitle(title);
                }
                // second cell: the author link (note that this stores the author's
                // profile URL in the author column, not a display name)
                Element child1 = children.get(1).child(0);
                Attributes attributes1 = child1.attributes();
                String href1 = attributes1.get("href");
                doubanPost.setAuthor(href1);
                // print and save the entity
                System.out.println(doubanPost);
                doubanPostMapper.addDoubanPost(doubanPost);
                System.out.println((i + 1) + "===============");
            }
            System.out.println("page boundary =====> start = " + k);
            // advance by the number of post rows on this page (30 on a full page)
            k += trs.size() - 1;
        }
    }
}
So yes, pure brute force; in the end it collected roughly fourteen thousand rows.
The results in the database:
2. Fetch each post's HTML page using the links from step 1
The database plumbing here is very similar to step 1, so it is omitted; only the final test class is shown.
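The Post entity the test class fills in is not shown in the original either; the following is a sketch reconstructed from the setters used below, so the exact field types are assumptions:

package com.fan.pojo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
// reconstructed from the setPostId/setTitle/setAuthor/setAuthorHref/
// setPostHref/setContent calls in the test class below
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Post {
    private int postId;        // numeric id parsed from the post URL
    private String title;      // copied from the 985posts row
    private String author;     // display name scraped from the post page
    private String authorHref; // author's profile URL
    private String postHref;   // URL of the post
    private String content;    // main post body text
}

And the test class: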
package com.fan;
import com.fan.mapper.DoubanPostMapper;
import com.fan.mapper.PostMapper;
import com.fan.pojo.DoubanPost;
import com.fan.pojo.Post;
import com.fan.util.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
/**
 * Crawls the body of each main post.
 * @author
 * @date 2020/8/24 - 14:08
 */
@SpringBootTest
public class CrawDoubanMainPosts {
    @Autowired
    private DoubanPostMapper doubanPostMapper;
    @Autowired
    private PostMapper postMapper;
    // fetch each post's HTML in bulk and store the main post content in the database
    @Test
    public void test01() throws URISyntaxException {
        // each post has its own URL, loaded from the database inside the loop
        Map<String, String> map = new HashMap<>();
        Map<String, String> mapTitle = new HashMap<>();
        // request headers
        mapTitle.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36 Edg/84.0.522.61");
        mapTitle.put("Cookie", "ll='118171'; bid=s9mINaPcmtA; __utmv=30149280.21555; douban-fav-remind=1; push_doumail_num=0; push_noty_num=0; ct=y; douban-profile-remind=1; ck=PnCH; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1598334460%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fwww.douban.com%252Fgroup%252Ftopic%252F175432568%252F%22%5D; _pk_id.100001.8cb4=9fc580371d86dd7a.1592482197.27.1598334460.1598257938.; _pk_ses.100001.8cb4=*; __utma=30149280.1877381396.1592482199.1598255406.1598334460.26; __utmc=30149280; __utmz=30149280.1598334460.26.3.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utmt=1; __utmb=30149280.4.8.1598334460");
        DoubanPost doubanPost = null; // the link row fetched on each iteration
        Post post = new Post();
        for (int i = 11; i < 14068; i++) {
            doubanPost = doubanPostMapper.queryDoubanPostById(i);
            // copy over the title
            post.setTitle(doubanPost.getTitle());
            // the author column from step 1 actually holds the profile URL
            post.setAuthorHref(doubanPost.getAuthor());
            // copy over the post URL
            post.setPostHref(doubanPost.getPostHref());
            // extract the numeric post id; the substring indices assume the fixed
            // prefix https://www.douban.com/group/topic/ and a trailing slash
            // (see the regex sketch after this class for a sturdier version)
            String postHref = doubanPost.getPostHref(); // the page is fetched from this URL below
            String s = postHref.substring(35, postHref.length() - 1);
            int postId = Integer.parseInt(s);
            post.setPostId(postId);
            // still to fill: author (display name) and content
            String html = HttpUtils.doGetHtml(postHref, map, mapTitle);
            Document document = Jsoup.parse(html);
            Element content = document.getElementById("content");
            // guard against a null #content (deleted posts)
            if (content == null) {
                System.out.println("content not found");
                post.setContent("content not found");
                // placeholder author for deleted posts
                String username = "poster has deleted this post";
                post.setAuthor(username);
                // save into the database
                postMapper.addPost(post);
                System.out.println("Saved row " + i + "!");
                continue;
            }
            // the main post body lives in #link-report
            Element elementById = content.getElementById("link-report");
            Elements p = elementById.getElementsByTag("p");
            // concatenate the <p> paragraphs; images and links are dropped for now,
            // keeping only the text
            StringBuilder article = new StringBuilder();
            for (Element element : p) {
                article.append(element.html()).append("\n");
            }
            // fill in the body text
            post.setContent(article.toString());
            // dig out the author's display name (fragile: this chain depends on the
            // exact DOM layout of the topic page)
            String username = content.child(0).child(0).child(2).child(1).child(0).child(0).child(0).html();
            post.setAuthor(username);
            // save into the database
            postMapper.addPost(post);
            System.out.println("Saved row " + i + "!");
        }
        System.out.println("Finished fetching the posts");
    }
}
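The substring(35, ...) index trick above assumes every stored URL has the exact shape https://www.douban.com/group/topic/<id>/. A regex is sturdier against http/https differences or a missing trailing slash; a minimal sketch (a hypothetical helper, not part of the original code):

package com.fan.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class PostIdExtractor {
    // matches the numeric id in .../group/topic/<id>, trailing slash optional
    private static final Pattern TOPIC_ID = Pattern.compile("/group/topic/(\\d+)/?");
    public static int extractId(String postHref) {
        Matcher m = TOPIC_ID.matcher(postHref);
        if (m.find()) {
            return Integer.parseInt(m.group(1));
        }
        throw new IllegalArgumentException("not a topic URL: " + postHref);
    }
}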
The results in the database:
This only crawls the body of each main post, not the comments, and by the time it finished Douban had rate-limited my IP (it let me pull this much first, so Douban is remarkably friendly to beginners; it certainly has plenty of capable engineers and could make the anti-crawling much harsher if it wanted). Java also feels slow for this kind of work: I haven't added multithreading yet, and even with it the plumbing would stay verbose, with visibly more code than the Python equivalent. I plan to switch to Python for crawling group posts from here on and keep Java for building websites.
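One mitigation the loops above skip entirely is pausing between requests, which is both politer and the simplest way to postpone an IP ban. A sketch of a randomized delay that could be called once per request; the interval values are illustrative:

// sleep for a random 1-3 seconds between requests to stay under rate limits
private static void politePause() {
    try {
        Thread.sleep(1000 + java.util.concurrent.ThreadLocalRandom.current().nextLong(2000));
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore the interrupt flag
    }
}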