我的队友是徐姣美 这是她的博客 https://home.cnblogs.com/u/xjmm/
开始就是先从网站上面爬取相关信息 https://blog.csdn.net/u014636245/article/details/91426736 是在这个网站上面
代码的爬取我是使用的py,下面是我的代码
1 import requests 2 from bs4 import BeautifulSoup 3 import bs4 4 # -*- coding: UTF-8 -* 5 from urllib.request import urlopen 6 from pdfminer.pdfinterp import PDFResourceManager, process_pdf 7 from pdfminer.converter import TextConverter 8 from pdfminer.layout import LAParams 9 from io import StringIO 10 from pyhanlp import * 11 import time 12 13 import requests 14 import json 15 from pymysql import * 16 17 #连接数据库的方法 18 def connectDB(): 19 try: 20 db=connect(host='localhost',port=3306,user='root',password='123456',db='python') 21 print("数据库连接成功") 22 return db 23 except Exception as e: 24 print(e) 25 return NULL 26 27 db = connectDB() 28 29 #向数据库中插入数据的方法 30 def insertInformation(title,abstract,keywords,href): 31 cursor=db.cursor() 32 try: 33 cursor.execute("insert into new_table(title,abstract,keywords,href) values('%s','%s','%s','%s')" % (title,abstract,keywords,href)) 34 print("插入成功") 35 db.commit() 36 cursor.close() 37 return True 38 except Exception as e: 39 print(e) 40 db.rollback() 41 return False 42 43 list_href=[] 44 list_title=[] 45 46 def getHtmlText(url): 47 r = requests.get(url) 48 r.raise_for_status() 49 r.encoding = r.apparent_encoding 50 html = r.text 51 return html 52 53 54 55 def getDataFromHtml(list,html): 56 bs = BeautifulSoup(html, "lxml") 57 for td in bs.tbody.find_all("td"): 58 if isinstance(td,bs4.element.Tag): 59 for a in td.find_all("a"): 60 list_href.append(a['href']) 61 list_title.append(a.text) 62 63 def showAll(list): 64 for univ in list: 65 print(univ) 66 67 68 def readPDF(pdfFile): 69 rsrcmgr = PDFResourceManager() 70 retstr = StringIO() 71 laparams = LAParams() 72 device = TextConverter(rsrcmgr, retstr, laparams=laparams) 73 process_pdf(rsrcmgr, device, pdfFile) 74 device.close() 75 content = retstr.getvalue() 76 retstr.close() 77 return content 78 79 if __name__ == '__main__': 80 url = "https://blog.csdn.net/u014636245/article/details/91426736" 81 try: 82 html = getHtmlText(url) 83 getDataFromHtml(list,html) 84 for i in range(0,len(list_title)): 85 print(i) 86 pdfFile = urlopen(list_href[i]) 87 # 远程 88 outputString = readPDF(pdfFile) 89 if "Abstract" in outputString: 90 document = "" 91 if "1. Introduction" in outputString and "Abstract" in outputString: 92 document = outputString[outputString.index("Abstract"):outputString.index("1. Introduction")] 93 elif "1.Introduction" in outputString and "Abstract" in outputString: 94 document = outputString[outputString.index("Abstract"):outputString.index("1.Introduction")] 95 else : 96 document = outputString[outputString.index("Abstract"):outputString.index("Abstract")+800] 97 # print(document) 98 keywords = HanLP.extractKeyword(document, 10) 99 print(keywords) 100 str = "" 101 for k in keywords: 102 str+=k+" " 103 pdfFile.close() 104 insertInformation(list_title[i],document,str,list_href[i]) 105 time.sleep(0.1) 106 except Exception as e: 107 print(e) 108 print("爬取失败")
然后爬取结束后是这个样子
有很多很多条 ,关键词是每个keyword里面有10个关键词;
然后就是将他们从数据库中取出来放在数组中,然后再进行排序,找最大;
不要忘记将介词等无用词去掉;
进行排序最简单的是使用的map
// 排序 List<Map.Entry<String ,Integer>> list = new ArrayList<Map.Entry<String,Integer>>(map.entrySet()); //在java中,如果要对集合对象或数组对象进行排序,需要实现Comparator接口以达到我们想要的目标 Comparator<Map.Entry<String,Integer>> comparator = new Comparator<Map.Entry<String, Integer>>() { public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) { return (left.getValue().compareTo(right.getValue())); } }; // 集合默认升序升序 Collections.sort(list,comparator); String ten[]=new String[50]; int shu[]=new int[50]; for(int i=0;i<50;i++){// 由高到低输出 ten[i]=list.get(list.size()-i-1).getKey(); shu[i]=list.get(list.size()-i-1).getValue(); Tu tu =new Tu(); tu.name=ten[i]; tu.value=shu[i]; list_tu.add(tu); System.out.println(list.get(list.size()-i-1).getKey() +":"+list.get(list.size()-i-1).getValue()); }
然后设置一个点击事件,转换成json的代码形式
Gson gson = new Gson(); String json = gson.toJson(list_tu); response.getWriter().write(json);
然后使用echarts设计热词云
1 <%@ page language="java" contentType="text/html; charset=UTF-8" 2 pageEncoding="UTF-8"%> 3 <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%> 4 <!DOCTYPE html> 5 <html> 6 <head> 7 <meta charset="UTF-8"> 8 <title>Insert title here</title> 9 <link rel="stylesheet" href="css/bootstrap.min.css" type="text/css" /> 10 <script src="js/jquery-1.11.3.min.js" type="text/javascript"></script> 11 <script type="text/javascript" src="js/echarts.min.js"></script> 12 <script type="text/javascript" src="js/china.js"></script> 13 <script src="js/bootstrap.min.js" type="text/javascript"></script> 14 <script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script> 15 <script src='js/echarts-wordcloud.js'></script> 16 </head> 17 <body> 18 <div id="main" style=" 100%;height: 400px"></div> 19 <div> 20 <table class="table" style=" 100%;align-content: center;" > 21 <tr> 22 <th align="center">论文连接</th> 23 </tr> 24 <c:forEach var="item" items="${list}"> 25 <tr> 26 <td><a href="${item.lianjie }">${item.title}</a></td> 27 </tr> 28 </c:forEach> 29 </table> 30 </div> 31 <script> 32 var chart = echarts.init(document.getElementById('main')); 33 var dt; 34 $.ajax({ 35 url : "PaperServlet_", 36 async : false, 37 type : "POST", 38 success : function(data) { 39 dt = data; 40 // alert(dt[0].title); 41 }, 42 error : function() { 43 alert("请求失败"); 44 }, 45 dataType : "json" 46 }); 47 var mydata = new Array(0); 48 for (var i = 0; i < dt.length; i++) { 49 var d = {}; 50 51 d["name"] = dt[i].name; 52 //alert(dt[i].name); 53 d["value"] = dt[i].value; 54 mydata.push(d); 55 } 56 var option = { 57 tooltip: {}, 58 series: [ { 59 type: 'wordCloud', 60 gridSize: 2, 61 sizeRange: [20, 50], 62 rotationRange: [-90, 90], 63 shape: 'pentagon', 64 600, 65 height: 300, 66 drawOutOfBound: true, 67 textStyle: { 68 normal: { 69 color: function () { 70 return 'rgb(' + [ 71 Math.round(Math.random() * 160), 72 Math.round(Math.random() * 160), 73 Math.round(Math.random() * 160) 74 ].join(',') + ')'; 75 } 76 }, 77 emphasis: { 78 shadowBlur: 10, 79 shadowColor: '#333' 80 } 81 }, 82 data: mydata 83 } ] 84 }; 85 86 chart.setOption(option); 87 chart.on('click', function (params) { 88 var url = "ClickServlet?geunjian=" + params.name; 89 window.location.href = url; 90 }); 91 window.onresize = chart.resize; 92 </script> 93 </body> 94 </html>
然后点击热词后携带此热词到servlet,再从数据库中找出论文的关键字中包含此热词的论文列表
1 import java.io.IOException; 2 import java.sql.SQLException; 3 import java.util.ArrayList; 4 import java.util.List; 5 6 import javax.servlet.ServletException; 7 import javax.servlet.annotation.WebServlet; 8 import javax.servlet.http.HttpServlet; 9 import javax.servlet.http.HttpServletRequest; 10 import javax.servlet.http.HttpServletResponse; 11 12 import com.me.dao.LWDao; 13 import com.me.domain.LunWen; 14 15 /** 16 * Servlet implementation class ClickServlet 17 */ 18 @WebServlet("/ClickServlet") 19 public class ClickServlet extends HttpServlet { 20 private static final long serialVersionUID = 1L; 21 LWDao dao = new LWDao(); 22 23 public ClickServlet() { 24 super(); 25 // TODO Auto-generated constructor stub 26 } 27 28 protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 29 String geunjian = request.getParameter("geunjian"); 30 System.out.println(geunjian); 31 List<LunWen> guan = new ArrayList<LunWen>(); 32 try { 33 guan = dao.login(geunjian); 34 } catch (SQLException e) { 35 e.printStackTrace(); 36 } 37 for(int i=0;i<guan.size();i++) { 38 if(guan.get(i).getLianjie()!=null) { 39 String ss = guan.get(i).getLianjie().substring(6,guan.get(i).getLianjie().length()); 40 guan.get(i).setLianjie("http://openaccess.thecvf.com/"+ss); 41 } 42 43 } 44 request.setAttribute("list", guan); 45 System.out.println(guan.size()); 46 request.getRequestDispatcher("lw.jsp").forward(request, response); 47 } 48 49 /** 50 * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) 51 */ 52 protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 53 // TODO Auto-generated method stub 54 doGet(request, response); 55 } 56 57 }
然后嘞,运行一下就可以了
大佬的博客写的非常的详细可以参考一哈 https://www.cnblogs.com/20183544-wangzhengshuai/p/12702137.html