一、要求:
1、完成论文的题目、摘要、关键词、原文链接四项内容爬取;
2、存储到本地数据库中;
3、按照题目、关键词分类统计得到最热的十个领域方向;
4、热词越多,在热词云中显示的就越大,还要将热词与文章链接,点击热词云中的热词可以找到与之对应的文章题目;
二、效果:
爬取数据:
from lxml import etree
from pymysql import connect
from jieba.analyse import *
import requests


class CVPR:
    """Stores crawled CVPR paper records in the local MySQL table ``CVPR``."""

    # Save one record
    def saveContent_list(self, title, zhaiyao, guanjian, lianjie):
        """Insert one (title, abstract, keyword, link) row and commit it.

        All four values are bound as query parameters, so callers must pass
        plain strings (not the lists returned by ``xpath``). Any database
        error propagates to the caller after the connection has been closed.
        """
        # Keyword arguments are used because recent PyMySQL releases no longer
        # accept positional connection parameters.
        # NOTE(review): credentials are hard-coded; move them to configuration.
        con = connect(host="localhost", user="root",
                      password="a3685371", database="pachong")
        try:
            cursors = con.cursor()
            cursors.execute("insert into CVPR values(%s,%s,%s,%s)",
                            (title, zhaiyao, guanjian, lianjie))
            con.commit()
        finally:
            # Close the connection even when the insert fails.
            con.close()


def first_text(nodes):
    """Return the first xpath result stripped of whitespace, or "" if there is none."""
    return nodes[0].strip() if nodes else ""


headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
url = "http://openaccess.thecvf.com/CVPR2019.py"
# NOTE(review): this proxy table is defined but never passed to requests.get.
proxies = {
    "http": "http://211.147.226.4",
    "https": "http://122.200.90.12",
}

cvpr = CVPR()
response = requests.get(url, headers=headers)
index_html = etree.HTML(response.content.decode())

# Links to the per-paper pages listed on the index page
hrefs = index_html.xpath("//div[@id='content']/dl/dt/a/@href")
for href in hrefs:
    href = "http://openaccess.thecvf.com/{0}".format(href)
    response2 = requests.get(href, headers=headers)
    paper_html = etree.HTML(response2.content.decode())

    # xpath returns lists; reduce each to a single string before storing it.
    title = first_text(paper_html.xpath("//div[@id='content']/dl/dd//div[@id='papertitle']/text()"))
    # First link under <dd> -- presumably the PDF link; confirm against the page layout.
    lianjie = first_text(paper_html.xpath("//div[@id='content']/dl/dd//a/@href"))
    zhaiyao = first_text(paper_html.xpath("//div[@id='content']/dl/dd//div[@id='abstract']/text()"))

    if not zhaiyao:
        # Without an abstract there is nothing to extract keywords from;
        # skip this paper instead of aborting the whole crawl.
        print("存入失败")
        continue

    # One row is stored per extracted keyword (top 5 by weight).
    for keyword, weight in extract_tags(zhaiyao, topK=5, withWeight=True):
        try:
            cvpr.saveContent_list(title, zhaiyao, keyword, lianjie)
            print("存入成功")
        except Exception as exc:
            # Report the cause instead of hiding it, then keep crawling.
            print("存入失败", exc)
使用echarts图表生成热词图:
<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<%--
  Hot-word page: renders a word cloud from the JSON returned by PaperServlet_
  and, below it, the list of papers placed in the request attribute "list".
  Clicking a word navigates to ClickServlet with that word as "geunjian".
--%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Insert title here</title>
<link rel="stylesheet" href="css/bootstrap.min.css" type="text/css" />
<script src="js/jquery-1.11.3.min.js" type="text/javascript"></script>
<script type="text/javascript" src="js/echarts.min.js"></script>
<script type="text/javascript" src="js/china.js"></script>
<script src="js/bootstrap.min.js" type="text/javascript"></script>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src='js/echarts-wordcloud.js'></script>
</head>
<body>
    <%-- Word cloud container --%>
    <div id="main" style="width: 100%;height: 400px"></div>
    <div>
        <table class="table" style="width: 100%;align-content: center;">
            <tr>
                <th align="center">论文连接</th>
            </tr>
            <c:forEach var="item" items="${list}">
                <tr>
                    <%-- c:out escapes crawled text so it cannot inject markup --%>
                    <td><a href="<c:out value='${item.lianjie}'/>"><c:out value="${item.title}"/></a></td>
                </tr>
            </c:forEach>
        </table>
    </div>
    <script>
        var chart = echarts.init(document.getElementById('main'));

        // Random dark-ish colour for each word.
        function randomColor() {
            return 'rgb(' + [
                Math.round(Math.random() * 160),
                Math.round(Math.random() * 160),
                Math.round(Math.random() * 160)
            ].join(',') + ')';
        }

        // Builds the word-cloud series from [{name, value}, ...] and draws it.
        function renderCloud(words) {
            var mydata = [];
            for (var i = 0; i < words.length; i++) {
                // Skip entries without a name; they cannot be drawn or clicked.
                if (!words[i] || !words[i].name) {
                    continue;
                }
                mydata.push({
                    name : words[i].name,
                    value : words[i].value
                });
            }
            chart.setOption({
                tooltip : {},
                series : [ {
                    type : 'wordCloud',
                    gridSize : 2,
                    sizeRange : [ 20, 50 ],
                    rotationRange : [ -90, 90 ],
                    shape : 'pentagon',
                    width : 600,
                    height : 300,
                    drawOutOfBound : true,
                    textStyle : {
                        normal : {
                            color : randomColor
                        },
                        emphasis : {
                            shadowBlur : 10,
                            shadowColor : '#333'
                        }
                    },
                    data : mydata
                } ]
            });
        }

        // Asynchronous request: the chart is drawn in the success callback, so a
        // failed request no longer leaves an undefined result to be iterated.
        $.ajax({
            url : "PaperServlet_",
            type : "POST",
            dataType : "json",
            success : function(data) {
                renderCloud(data || []);
            },
            error : function() {
                alert("请求失败");
            }
        });

        chart.on('click', function(params) {
            // Encode the word so spaces and non-ASCII characters survive the query string.
            window.location.href = "ClickServlet?geunjian=" + encodeURIComponent(params.name);
        });

        window.onresize = function() {
            chart.resize();
        };
    </script>
</body>
</html>
传递参数:
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.google.gson.Gson;
import com.me.dao.LWDao;
import com.me.domain.LunWen;
import com.me.domain.Tu;

/**
 * Returns the most frequent keywords of all stored papers as a JSON array of
 * {@code {name, value}} objects, ordered by descending frequency.
 */
@WebServlet("/PaperServlet_")
public class PaperServlet_ extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Upper bound on the number of words sent to the page. */
    private static final int MAX_WORDS = 100;

    public PaperServlet_() {
        super();
    }

    /**
     * Loads every paper, counts its keywords and writes the ranking as JSON.
     * If the query fails the error is logged and an empty array is written.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        response.setContentType("application/json;charset=UTF-8");
        response.setCharacterEncoding("UTF-8");

        List<LunWen> list = new ArrayList<LunWen>();
        try {
            list = new LWDao().search_();
        } catch (SQLException e) {
            log("Failed to load papers for the keyword ranking", e);
        }

        // The paper links are not part of the response, so they are not rewritten here.
        List<Tu> list_tu = countKeywords(list);
        response.getWriter().write(new Gson().toJson(list_tu));
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }

    /**
     * Splits each paper's keyword string on single spaces, counts every
     * distinct word and returns at most {@link #MAX_WORDS} entries, most
     * frequent first. Papers without keywords and empty tokens are ignored.
     */
    private static List<Tu> countKeywords(List<LunWen> papers) {
        // Insertion-ordered map: words with equal counts keep first-seen order.
        Map<String, Tu> byWord = new LinkedHashMap<String, Tu>();
        for (LunWen paper : papers) {
            String keywords = paper.getGuanjian();
            if (keywords == null) {
                continue;
            }
            for (String word : keywords.split(" ")) {
                if (word.length() == 0) {
                    continue;
                }
                Tu entry = byWord.get(word);
                if (entry == null) {
                    entry = new Tu();
                    entry.name = word;
                    byWord.put(word, entry);
                }
                entry.value++;
            }
        }

        List<Tu> ranked = new ArrayList<Tu>(byWord.values());
        Collections.sort(ranked, new Comparator<Tu>() {
            @Override
            public int compare(Tu left, Tu right) {
                // Descending by count, written without subtraction.
                if (left.value == right.value) {
                    return 0;
                }
                return left.value > right.value ? -1 : 1;
            }
        });

        if (ranked.size() > MAX_WORDS) {
            return new ArrayList<Tu>(ranked.subList(0, MAX_WORDS));
        }
        return ranked;
    }
}
package com.me.domain; public class Tu { public String name; public int value; }
LWDao dao = new LWDao();
List<LunWen> list = new ArrayList<LunWen>();
List<Tu> list_tu = new ArrayList<Tu>();
try {
    list = dao.search_();
} catch (SQLException e) {
    e.printStackTrace();
}

// Split into words, de-duplicate and count in one pass.
// The insertion-ordered map replaces the fixed 10000-slot arrays, which
// overflowed once the papers carried more keywords than that.
java.util.Map<String, Tu> counts = new java.util.LinkedHashMap<String, Tu>();
for (LunWen paper : list) {
    String keywords = paper.getGuanjian();
    if (keywords == null) {
        continue; // a paper without keywords contributes nothing
    }
    for (String word : keywords.split(" ")) {
        if (word.length() == 0) {
            continue; // consecutive spaces produce empty tokens
        }
        Tu entry = counts.get(word);
        if (entry == null) {
            entry = new Tu();
            entry.name = word;
            counts.put(word, entry);
        }
        entry.value++;
    }
}

// Sort by descending count.
list_tu.addAll(counts.values());
java.util.Collections.sort(list_tu, new java.util.Comparator<Tu>() {
    @Override
    public int compare(Tu left, Tu right) {
        if (left.value == right.value) {
            return 0;
        }
        return left.value > right.value ? -1 : 1;
    }
});

// Package: keep at most the 100 most frequent words. With fewer distinct
// words the list is simply shorter and is no longer padded with null names.
if (list_tu.size() > 100) {
    list_tu.subList(100, list_tu.size()).clear();
}
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.me.dao.LWDao;
import com.me.domain.LunWen;
import com.me.domain.Tu;

/**
 * Loads every stored paper, turns its stored link into an absolute URL and
 * forwards the list to {@code lw.jsp} as the request attribute {@code list}.
 */
@WebServlet("/LunServlet")
public class LunServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Site root that the stored links are appended to. */
    private static final String BASE_URL = "http://openaccess.thecvf.com/";

    /**
     * Number of leading characters dropped from a stored link before it is
     * appended to {@link #BASE_URL}. Presumably the length of a leading
     * "../../" written by the crawler; confirm against the stored data.
     */
    private static final int LINK_PREFIX_LENGTH = 6;

    public LunServlet() {
        super();
    }

    /**
     * Queries all papers and forwards them to the page. If the query fails
     * the error is logged and the page is rendered with an empty list.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        response.setHeader("content-type", "text/html;charset=UTF-8");
        response.setCharacterEncoding("UTF-8");

        List<LunWen> list = new ArrayList<LunWen>();
        try {
            list = new LWDao().search_();
        } catch (SQLException e) {
            log("Failed to load the paper list", e);
        }

        for (LunWen paper : list) {
            String link = paper.getLianjie();
            // Links shorter than the prefix are left untouched; slicing them
            // would throw StringIndexOutOfBoundsException.
            if (link != null && link.length() >= LINK_PREFIX_LENGTH) {
                paper.setLianjie(BASE_URL + link.substring(LINK_PREFIX_LENGTH));
            }
        }

        request.setAttribute("list", list);
        request.getRequestDispatcher("lw.jsp").forward(request, response);
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}
package com.me.domain; public class LunWen { private String title; private String zhaiyao; private String guanjian; private String lianjie; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getZhaiyao() { return zhaiyao; } public void setZhaiyao(String zhaiyao) { this.zhaiyao = zhaiyao; } public String getGuanjian() { return guanjian; } public void setGuanjian(String guanjian) { this.guanjian = guanjian; } public String getLianjie() { return lianjie; } public void setLianjie(String lianjie) { this.lianjie = lianjie; } }
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.me.dao.LWDao;
import com.me.domain.LunWen;

/**
 * Looks up the papers whose keyword text contains the clicked hot word
 * (request parameter {@code geunjian}) and forwards them to {@code lw.jsp}
 * as the request attribute {@code list}.
 */
@WebServlet("/ClickServlet")
public class ClickServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Site root that the stored links are appended to. */
    private static final String BASE_URL = "http://openaccess.thecvf.com/";

    /**
     * Number of leading characters dropped from a stored link before it is
     * appended to {@link #BASE_URL}. Presumably the length of a leading
     * "../../" written by the crawler; confirm against the stored data.
     */
    private static final int LINK_PREFIX_LENGTH = 6;

    LWDao dao = new LWDao();

    public ClickServlet() {
        super();
    }

    /**
     * Searches by the {@code geunjian} parameter and forwards the matches.
     * A missing parameter yields an empty list instead of a search for the
     * literal text "null"; a failed query is logged and also yields an
     * empty list.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        String geunjian = request.getParameter("geunjian");

        List<LunWen> guan = new ArrayList<LunWen>();
        if (geunjian != null) {
            try {
                guan = dao.login(geunjian);
            } catch (SQLException e) {
                log("Failed to search papers by keyword", e);
            }
        }

        for (LunWen paper : guan) {
            String link = paper.getLianjie();
            // Links shorter than the prefix are left untouched; slicing them
            // would throw StringIndexOutOfBoundsException.
            if (link != null && link.length() >= LINK_PREFIX_LENGTH) {
                paper.setLianjie(BASE_URL + link.substring(LINK_PREFIX_LENGTH));
            }
        }

        request.setAttribute("list", guan);
        request.getRequestDispatcher("lw.jsp").forward(request, response);
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}
package com.me.dao;

import java.sql.SQLException;
import java.util.List;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;

import com.me.domain.LunWen;
import com.me.utils.DBUtils;

/** Read-only access to the {@code cvpr} table, mapped onto {@link LunWen} beans. */
public class LWDao {

    /**
     * Returns every row of the {@code cvpr} table.
     *
     * @return all stored papers
     * @throws SQLException if the query fails
     */
    public List<LunWen> search_() throws SQLException {
        QueryRunner qr = new QueryRunner(DBUtils.getDataSource());
        String sql = "select * from cvpr";
        return qr.query(sql, new BeanListHandler<LunWen>(LunWen.class));
    }

    /**
     * Returns the papers whose {@code guanjian} column contains the given text.
     *
     * @param guanjien text to look for; it is bound as a query parameter, so
     *                 quotes in it cannot alter the SQL statement
     * @return the matching papers
     * @throws SQLException if the query fails
     */
    public List<LunWen> login(String guanjien) throws SQLException {
        QueryRunner qr = new QueryRunner(DBUtils.getDataSource());
        // Placeholder instead of string concatenation: the value comes
        // straight from a request parameter.
        String sql = "select * from cvpr where guanjian like ?";
        String pattern = "%" + guanjien + "%";
        return qr.query(sql, new BeanListHandler<LunWen>(LunWen.class), pattern);
    }
}
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.google.gson.Gson;
import com.me.dao.LWDao;
import com.me.domain.LunWen;
import com.me.domain.Tu;

/**
 * Returns the most frequent keywords of all stored papers as a JSON array of
 * {@code {name, value}} objects, ordered by descending frequency.
 */
@WebServlet("/PaperServlet_")
public class PaperServlet_ extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Upper bound on the number of words sent to the page. */
    private static final int MAX_WORDS = 100;

    public PaperServlet_() {
        super();
    }

    /**
     * Loads every paper, counts its keywords and writes the ranking as JSON.
     * If the query fails the error is logged and an empty array is written.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        response.setContentType("application/json;charset=UTF-8");
        response.setCharacterEncoding("UTF-8");

        List<LunWen> list = new ArrayList<LunWen>();
        try {
            list = new LWDao().search_();
        } catch (SQLException e) {
            log("Failed to load papers for the keyword ranking", e);
        }

        // The paper links are not part of the response, so they are not rewritten here.
        List<Tu> list_tu = countKeywords(list);
        response.getWriter().write(new Gson().toJson(list_tu));
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }

    /**
     * Splits each paper's keyword string on single spaces, counts every
     * distinct word and returns at most {@link #MAX_WORDS} entries, most
     * frequent first. Papers without keywords and empty tokens are ignored.
     */
    private static List<Tu> countKeywords(List<LunWen> papers) {
        // Insertion-ordered map: words with equal counts keep first-seen order.
        Map<String, Tu> byWord = new LinkedHashMap<String, Tu>();
        for (LunWen paper : papers) {
            String keywords = paper.getGuanjian();
            if (keywords == null) {
                continue;
            }
            for (String word : keywords.split(" ")) {
                if (word.length() == 0) {
                    continue;
                }
                Tu entry = byWord.get(word);
                if (entry == null) {
                    entry = new Tu();
                    entry.name = word;
                    byWord.put(word, entry);
                }
                entry.value++;
            }
        }

        List<Tu> ranked = new ArrayList<Tu>(byWord.values());
        Collections.sort(ranked, new Comparator<Tu>() {
            @Override
            public int compare(Tu left, Tu right) {
                // Descending by count, written without subtraction.
                if (left.value == right.value) {
                    return 0;
                }
                return left.value > right.value ? -1 : 1;
            }
        });

        if (ranked.size() > MAX_WORDS) {
            return new ArrayList<Tu>(ranked.subList(0, MAX_WORDS));
        }
        return ranked;
    }
}
1)热词实体
package com.me.domain; public class Tu { public String name; public int value; }
2)将关键字分割成单词然后对单词进行去重、计数和排序,装到list
LWDao dao = new LWDao();
List<LunWen> list = new ArrayList<LunWen>();
List<Tu> list_tu = new ArrayList<Tu>();
try {
    list = dao.search_();
} catch (SQLException e) {
    e.printStackTrace();
}

// Split into words, de-duplicate and count in one pass.
// The insertion-ordered map replaces the fixed 10000-slot arrays, which
// overflowed once the papers carried more keywords than that.
java.util.Map<String, Tu> counts = new java.util.LinkedHashMap<String, Tu>();
for (LunWen paper : list) {
    String keywords = paper.getGuanjian();
    if (keywords == null) {
        continue; // a paper without keywords contributes nothing
    }
    for (String word : keywords.split(" ")) {
        if (word.length() == 0) {
            continue; // consecutive spaces produce empty tokens
        }
        Tu entry = counts.get(word);
        if (entry == null) {
            entry = new Tu();
            entry.name = word;
            counts.put(word, entry);
        }
        entry.value++;
    }
}

// Sort by descending count.
list_tu.addAll(counts.values());
java.util.Collections.sort(list_tu, new java.util.Comparator<Tu>() {
    @Override
    public int compare(Tu left, Tu right) {
        if (left.value == right.value) {
            return 0;
        }
        return left.value > right.value ? -1 : 1;
    }
});

// Package: keep at most the 100 most frequent words. With fewer distinct
// words the list is simply shorter and is no longer padded with null names.
if (list_tu.size() > 100) {
    list_tu.subList(100, list_tu.size()).clear();
}
4、论文链接列表数据准备(LunServlet是最初访问的地方,携带数据跳转到jsp界面)
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.me.dao.LWDao;
import com.me.domain.LunWen;
import com.me.domain.Tu;

/**
 * Loads every stored paper, turns its stored link into an absolute URL and
 * forwards the list to {@code lw.jsp} as the request attribute {@code list}.
 */
@WebServlet("/LunServlet")
public class LunServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Site root that the stored links are appended to. */
    private static final String BASE_URL = "http://openaccess.thecvf.com/";

    /**
     * Number of leading characters dropped from a stored link before it is
     * appended to {@link #BASE_URL}. Presumably the length of a leading
     * "../../" written by the crawler; confirm against the stored data.
     */
    private static final int LINK_PREFIX_LENGTH = 6;

    public LunServlet() {
        super();
    }

    /**
     * Queries all papers and forwards them to the page. If the query fails
     * the error is logged and the page is rendered with an empty list.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        response.setHeader("content-type", "text/html;charset=UTF-8");
        response.setCharacterEncoding("UTF-8");

        List<LunWen> list = new ArrayList<LunWen>();
        try {
            list = new LWDao().search_();
        } catch (SQLException e) {
            log("Failed to load the paper list", e);
        }

        for (LunWen paper : list) {
            String link = paper.getLianjie();
            // Links shorter than the prefix are left untouched; slicing them
            // would throw StringIndexOutOfBoundsException.
            if (link != null && link.length() >= LINK_PREFIX_LENGTH) {
                paper.setLianjie(BASE_URL + link.substring(LINK_PREFIX_LENGTH));
            }
        }

        request.setAttribute("list", list);
        request.getRequestDispatcher("lw.jsp").forward(request, response);
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}
1)论文实体
package com.me.domain; public class LunWen { private String title; private String zhaiyao; private String guanjian; private String lianjie; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getZhaiyao() { return zhaiyao; } public void setZhaiyao(String zhaiyao) { this.zhaiyao = zhaiyao; } public String getGuanjian() { return guanjian; } public void setGuanjian(String guanjian) { this.guanjian = guanjian; } public String getLianjie() { return lianjie; } public void setLianjie(String lianjie) { this.lianjie = lianjie; } }
5、点击热词后携带此热词到servlet,再从数据库中找出论文的关键字中包含此热词的论文列表
package com.me.servlet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.me.dao.LWDao;
import com.me.domain.LunWen;

/**
 * Looks up the papers whose keyword text contains the clicked hot word
 * (request parameter {@code geunjian}) and forwards them to {@code lw.jsp}
 * as the request attribute {@code list}.
 */
@WebServlet("/ClickServlet")
public class ClickServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Site root that the stored links are appended to. */
    private static final String BASE_URL = "http://openaccess.thecvf.com/";

    /**
     * Number of leading characters dropped from a stored link before it is
     * appended to {@link #BASE_URL}. Presumably the length of a leading
     * "../../" written by the crawler; confirm against the stored data.
     */
    private static final int LINK_PREFIX_LENGTH = 6;

    LWDao dao = new LWDao();

    public ClickServlet() {
        super();
    }

    /**
     * Searches by the {@code geunjian} parameter and forwards the matches.
     * A missing parameter yields an empty list instead of a search for the
     * literal text "null"; a failed query is logged and also yields an
     * empty list.
     */
    @Override
    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        String geunjian = request.getParameter("geunjian");

        List<LunWen> guan = new ArrayList<LunWen>();
        if (geunjian != null) {
            try {
                guan = dao.login(geunjian);
            } catch (SQLException e) {
                log("Failed to search papers by keyword", e);
            }
        }

        for (LunWen paper : guan) {
            String link = paper.getLianjie();
            // Links shorter than the prefix are left untouched; slicing them
            // would throw StringIndexOutOfBoundsException.
            if (link != null && link.length() >= LINK_PREFIX_LENGTH) {
                paper.setLianjie(BASE_URL + link.substring(LINK_PREFIX_LENGTH));
            }
        }

        request.setAttribute("list", guan);
        request.getRequestDispatcher("lw.jsp").forward(request, response);
    }

    /** Handles POST exactly like GET. */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}
6、dao层:
package com.me.dao;

import java.sql.SQLException;
import java.util.List;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;

import com.me.domain.LunWen;
import com.me.utils.DBUtils;

/** Read-only access to the {@code cvpr} table, mapped onto {@link LunWen} beans. */
public class LWDao {

    /**
     * Returns every row of the {@code cvpr} table.
     *
     * @return all stored papers
     * @throws SQLException if the query fails
     */
    public List<LunWen> search_() throws SQLException {
        QueryRunner qr = new QueryRunner(DBUtils.getDataSource());
        String sql = "select * from cvpr";
        return qr.query(sql, new BeanListHandler<LunWen>(LunWen.class));
    }

    /**
     * Returns the papers whose {@code guanjian} column contains the given text.
     *
     * @param guanjien text to look for; it is bound as a query parameter, so
     *                 quotes in it cannot alter the SQL statement
     * @return the matching papers
     * @throws SQLException if the query fails
     */
    public List<LunWen> login(String guanjien) throws SQLException {
        QueryRunner qr = new QueryRunner(DBUtils.getDataSource());
        // Placeholder instead of string concatenation: the value comes
        // straight from a request parameter.
        String sql = "select * from cvpr where guanjian like ?";
        String pattern = "%" + guanjien + "%";
        return qr.query(sql, new BeanListHandler<LunWen>(LunWen.class), pattern);
    }
}