任务分解:
- 爬取CVPR2019论文到数据库中;
- 分析、查找出关键词,并排序;
- 生成热词云图;
- 点击热词云中的热词可以找到与之对应的文章题目;
一、python爬虫
分析网站的HTML页面结构,利用Python爬取数据,然后存储到MySQL数据库中。爬取的数据为论文的题目和摘要内容,关键词通过将题目拆分获得。

import requests
import pymysql
from bs4 import BeautifulSoup

# Crawl the CVPR 2019 open-access index, fetch each paper's abstract page and
# store (title, link, abstract) rows in the local MySQL table `lunwen`.
#
# NOTE(review): the Java DAO in this project reads a `keywords` column, but
# this script never writes one -- presumably it is filled in a separate step;
# confirm against the real table schema.

# Connect to the local database.  All parameters are passed by keyword because
# PyMySQL 1.0+ no longer accepts positional connection arguments.
db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root',
                     database='test', charset='utf8')
cursor = db.cursor()

# Request headers; these are now actually sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}

try:
    # Fetch the index page with a GET request.
    url = "http://openaccess.thecvf.com/CVPR2019.py"
    html = requests.get(url, headers=headers)

    # Parse the page with Beautiful Soup and collect every <a> whose text is "pdf".
    soup = BeautifulSoup(html.content, 'html.parser')
    pdfs = soup.findAll(name="a", text="pdf")

    lis = []
    for pdf in pdfs:
        # The paper identifier is the PDF file name without its extension and
        # without the "_CVPR_2019_paper" suffix.
        pdf_name = pdf["href"].split('/')[-1]
        name = pdf_name.split('.')[0].replace("_CVPR_2019_paper", "")
        link = "http://openaccess.thecvf.com/content_CVPR_2019/html/" + name + "_CVPR_2019_paper.html"
        print(link)

        html1 = requests.get(link, headers=headers)
        soup1 = BeautifulSoup(html1.content, 'html.parser')
        weizhi = soup1.find('div', attrs={'id': 'abstract'})

        # Reset the abstract for every paper so that a page without an
        # abstract <div> does not inherit the previous paper's text.
        jianjie = ""
        if weizhi:
            jianjie = weizhi.get_text()
            print("ok")

        info = {}
        info['title'] = name
        info['link'] = link
        info['abstract'] = jianjie
        lis.append(info)

    print(lis)

    for info in lis:
        # Column list built from the dict keys, e.g. '`title`, `link`, `abstract`'.
        cols = ", ".join('`{}`'.format(k) for k in info.keys())
        print(cols)
        # Matching named placeholders, e.g. '%(title)s, %(link)s, %(abstract)s'.
        val_cols = ', '.join('%({})s'.format(k) for k in info.keys())
        print(val_cols)
        sql = "insert into lunwen(%s) values(%s)"
        res_sql = sql % (cols, val_cols)
        print(res_sql)
        # The dict is passed as named parameters, so the values themselves are
        # escaped by the driver rather than interpolated into the SQL text.
        cursor.execute(res_sql, info)
        db.commit()
    print("ok")
finally:
    # Always release the cursor and the connection, even if the crawl fails.
    cursor.close()
    db.close()
数据库的结构:
二、提取关键词
dao层:

package dao; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; import java.util.stream.Collectors; import Bean.Data; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.List; import jdbc.*; public class Dao { public static Map<String,Integer> getrc() { String sql="select * from lunwen"; Map<String, Integer>map= new HashMap<String, Integer>(); Map<String, Integer>results= new LinkedHashMap<String, Integer>(); Connection con=null; Statement state=null; ResultSet rs=null; con=Util.getConn(); try { state=con.createStatement(); rs=state.executeQuery(sql); while(rs.next()) { String keywords=rs.getString("keywords"); //System.out.println(keywords); String[] split = keywords.split(","); for(int i=0;i<split.length;i++) { if(map.get(split[i])==null) { map.put(split[i],0); } else { map.replace(split[i], map.get(split[i])+1); } } } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } Util.close(rs, state, con); map.entrySet() .stream() .sorted((p1, p2) -> p2.getValue().compareTo(p1.getValue())) .collect(Collectors.toList()) .forEach(ele -> results.put(ele.getKey(), ele.getValue())); return results; } public List<Data> list(String keywords) { // 查询所有信息 List<Data> list = new ArrayList<Data>(); // 创建集合 Connection conn = Util.getConn(); String sql = "select * from lunwen where keywords like "+"'%"+keywords+"%'"; // SQL查询语句 try { PreparedStatement pst = conn.prepareStatement(sql); ResultSet rs = pst.executeQuery(); Data data = null; while (rs.next()) { String title = rs.getString("title"); String link = rs.getString("link"); String as= rs.getString("abstract"); data = new Data(title,link,as,keywords); list.add(data); } rs.close(); // 关闭 
pst.close(); // 关闭 } catch (SQLException e1) { e1.printStackTrace(); // 抛出异常 } return list; // 返回一个集合 } public static List<Data> list2() { // 查询所有信息 List<Data> list2 = new ArrayList<Data>(); // 创建集合 Connection conn = Util.getConn(); String sql = "select * from lunwen "; // SQL查询语句 try { PreparedStatement pst = conn.prepareStatement(sql); ResultSet rs = pst.executeQuery(); Data data = null; while (rs.next()) { String title = rs.getString("title"); String link = rs.getString("link"); String as= rs.getString("abstract"); String keywords=rs.getString("keywords"); data = new Data(title,link,as,keywords); list2.add(data); } rs.close(); // 关闭 pst.close(); // 关闭 } catch (SQLException e1) { e1.printStackTrace(); // 抛出异常 } return list2; // 返回一个集合 } }
bean:

package Bean; public class Data { private String title; private String link; private String as; private String keywords; private int value; public int getValue() { return this.value; } public void SetValue(int value){ this.value=value; } public String getTitle() { return this.title; } public void SetTitle(String title){ this.title=title; } public String getLink() { return this.link; } public void SetLink(String link){ this.link=link; } public String getAs() { return this.as; } public void SetAs(String as){ this.as=as; } public String getKeywords() { return this.keywords; } public void SetKeywords(String keywords){ this.keywords=keywords; } public Data(String title,String link,String as,String keywords ) { this.title=title; this.link=link; this.as=as; this.keywords=keywords; } public Data(String title,int value) { this.title=title; this.value=value; } }
三、热词云可视化展示:

<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<%request.setCharacterEncoding("utf-8"); response.setCharacterEncoding("utf-8");%>
<%--
  Hot-word page: the left pane lists the papers in the "list" request
  attribute, the right pane renders a word cloud built from the JSON returned
  by RcServlet. Clicking a word navigates to ClickServlet with that word.
--%>
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Hot-words</title>
<link type="text/css" rel="stylesheet" href="css/style.css">
<script src="js/jquery-3.4.1.min.js"></script>
<script src="js/echarts.min.js"></script>
<script src="js/echarts-cloud.js"></script>
<style>
/* Word-cloud container (right pane). */
#main {
    width: 30%;
    height: 500px;
    border: 1px solid #ddd;
    float: right;
}

/* Scrollable paper list (left pane). */
#table {
    overflow-x: auto;
    overflow-y: auto;
    width: 60%;
    height: 500px;
    float: left;
    margin-top: 100px;
    padding-top: 100px;
}
</style>
</head>
<body>
<br>
<h1>Hot-words</h1>
<br>
<br>
<br>
<div id="table">
    <table id='gradient-style'>
        <tr>
            <th align="center">link</th>
        </tr>
        <%-- c:out escapes the database values so they cannot inject markup. --%>
        <c:forEach var="item" items="${list}">
            <tr>
                <td><a href="<c:out value="${item.link}"/>"><c:out value="${item.title}"/></a></td>
            </tr>
        </c:forEach>
    </table>
</div>
<div id="main">
</div>
<script type="text/javascript">
    $.ajax({
        url : "RcServlet",
        async : true,
        type : "POST",
        data : {},
        dataType : "json",
        success : function(data) {
            // Keep only the fields the word cloud needs.
            var mydata = [];
            for (var i = 0; i < data.length; i++) {
                mydata.push({
                    name : data[i].name,
                    value : data[i].value
                });
            }

            var myChart = echarts.init(document.getElementById('main'));
            myChart.setOption({
                title : {
                    text : ''
                },
                tooltip : {},
                series : [ {
                    type : 'wordCloud', // word-cloud series
                    shape : 'smooth',
                    gridSize : 8, // grid size
                    size : [ '50%', '50%' ],
                    //sizeRange : [ 50, 100 ],
                    rotationRange : [ -45, 0, 45, 90 ], // rotation range
                    textStyle : {
                        normal : {
                            fontFamily : '微软雅黑',
                            // Random RGB colour for every word.
                            color : function() {
                                return 'rgb(' + Math.round(Math.random() * 255)
                                        + ', ' + Math.round(Math.random() * 255)
                                        + ', ' + Math.round(Math.random() * 255) + ')'
                            }
                        },
                        emphasis : {
                            shadowBlur : 5, // shadow blur
                            shadowColor : '#333' // shadow colour
                        }
                    },
                    left : 'center',
                    top : 'center',
                    right : null,
                    bottom : null,
                    width : '100%',
                    height : '100%',
                    data : mydata
                } ]
            });

            // Clicking a word opens the list of papers that contain it. The
            // word is URL-encoded so characters such as '&' or '#' survive.
            myChart.on('click', function(params) {
                var url = "ClickServlet?keywords=" + encodeURIComponent(params.name);
                window.location.href = url;
            });
            alert("成功!");
        },
        error : function() {
            alert("请求失败");
        }
    });
</script>
</body>
</html>
四、点击热词之后的跳转:
ClickServlet:

package servlet; import java.io.IOException; import java.util.List; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import Bean.Data; import dao.Dao; /** * Servlet implementation class ClickServlet */ @WebServlet("/ClickServlet") public class ClickServlet extends HttpServlet { private static final long serialVersionUID = 1L; /** * @see HttpServlet#HttpServlet() */ public ClickServlet() { super(); // TODO Auto-generated constructor stub } /** * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) */ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub response.getWriter().append("Served at: ").append(request.getContextPath()); doPost(request, response); } /** * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) */ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { System.out.println("ClickServlet 运行"); request.setCharacterEncoding("UTF-8"); String keywords=request.getParameter("keywords"); Dao dao = new Dao(); List<Data> list=null; list=dao.list(keywords); System.out.println(list); request.setAttribute("list",list); request.getRequestDispatcher("Rc.jsp").forward(request, response); } }
Linkservlet:

package servlet; import java.io.IOException; import java.util.List; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import Bean.Data; import dao.Dao; /** * Servlet implementation class Linkservlet */ @WebServlet("/Linkservlet") public class Linkservlet extends HttpServlet { private static final long serialVersionUID = 1L; /** * @see HttpServlet#HttpServlet() */ public Linkservlet() { super(); // TODO Auto-generated constructor stub } /** * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) */ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub System.out.println("LinkServlet 运行"); response.getWriter().append("Served at: ").append(request.getContextPath()); response.setContentType("text/html;charset=UTF-8"); request.setCharacterEncoding("UTF-8"); String method = request.getParameter("method"); if(method.equals("find")) { find(request,response); } } /** * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) */ private void find(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { response.setContentType("text/html;charset=UTF-8"); request.setCharacterEncoding("UTF-8"); List<Data> list=null; list=Dao.list2(); System.out.println(list); request.setAttribute("list", list); request.getRequestDispatcher("Rc.jsp").forward(request,response); } protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub doGet(request, response); } }
RcServlet:

package servlet; import java.io.IOException; import java.sql.Date; import java.util.List; import java.util.Map; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import Bean.Data; import dao.Dao; import net.sf.json.JSONArray; import net.sf.json.JSONObject; @WebServlet("/RcServlet") public class RcServlet extends HttpServlet { private static final long serialVersionUID = 1L; /** * @see HttpServlet#HttpServlet() */ public RcServlet() { super(); // TODO Auto-generated constructor stub } /** * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) */ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { this.doPost(request, response); } /** * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) */ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { System.out.println("Rcservlet 运行"); response.setContentType("text/html;charset=UTF-8"); request.setCharacterEncoding("UTF-8"); Map<String, Integer>sortMap=Dao.getrc(); JSONArray json =new JSONArray(); int k=0; for (Map.Entry<String, Integer> entry : sortMap.entrySet()) { JSONObject ob=new JSONObject(); ob.put("name", entry.getKey()); ob.put("value", entry.getValue()); if(!(entry.getKey().equals("for")||entry.getKey().equals("and")||entry.getKey().equals("With")||entry.getKey().equals("of")||entry.getKey().equals("in")||entry.getKey().equals("From")||entry.getKey().equals("A")||entry.getKey().equals("to")||entry.getKey().equals("a")||entry.getKey().equals("the")||entry.getKey().equals("by"))) { json.add(ob); k++; } if(k==10) break; } System.out.println(json.toString());//输出JSON数据 response.getWriter().write(json.toString()); } }
五、结果展示: