爬取代码:
# -*- coding: UTF-8 -*-
"""Crawl a paper index page, extract each paper's abstract and keywords, and store them in MySQL."""
# NOTE: the import order of the original listing is kept on purpose; the two
# star imports can rebind names, so reordering could change what is in scope.
import requests
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from pyhanlp import *
import time

import requests
import json
from pymysql import *

# Section headings that mark the end of the abstract in the extracted PDF text.
_INTRO_MARKERS = ("1. Introduction", "1.Introduction")
# Number of characters kept after "Abstract" when no introduction heading is found.
_ABSTRACT_FALLBACK_LEN = 800


def connectDB():
    """Open a connection to the local MySQL database.

    Returns:
        The pymysql connection, or ``None`` when the connection attempt fails.
    """
    # NOTE(review): credentials are hard-coded; consider reading them from
    # configuration or the environment.
    try:
        conn = connect(host='localhost', port=3306, user='root', password='123456', db='python')
        print("数据库连接成功")
        return conn
    except Exception as e:
        print(e)
        # Return None instead of the name NULL, which is not a connection object.
        return None


db = connectDB()


def insertInformation(title, abstract, keywords, href):
    """Insert one paper record into ``new_table``.

    Args:
        title: Paper title.
        abstract: Abstract text extracted from the PDF.
        keywords: Space-separated keyword string.
        href: Link the PDF was downloaded from.

    Returns:
        True when the row was committed, False otherwise.
    """
    if db is None:
        print("数据库未连接,无法插入")
        return False
    # Placeholders let the driver escape the values; abstracts routinely
    # contain quotes that broke the previous string-formatted statement.
    sql = "insert into new_table(title,abstract,keywords,href) values(%s,%s,%s,%s)"
    try:
        # The context manager closes the cursor on both success and failure.
        with db.cursor() as cursor:
            cursor.execute(sql, (title, abstract, keywords, href))
        db.commit()
        print("插入成功")
        return True
    except Exception as e:
        print(e)
        db.rollback()
        return False


list_href = []
list_title = []


def getHtmlText(url):
    """Download ``url`` and return the response body as text.

    Raises:
        requests.RequestException: on connection problems, timeouts or an
            HTTP error status.
    """
    # A timeout keeps the crawler from hanging forever on a stalled server.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def getDataFromHtml(list, html):
    """Collect every link inside the page's first ``<tbody>``.

    The link targets are appended to the module-level ``list_href`` and the
    link texts to ``list_title``.

    Args:
        list: Unused; kept (with its original name) so existing callers
            keep working.
        html: HTML source of the index page.
    """
    bs = BeautifulSoup(html, "lxml")
    if bs.tbody is None:
        # No table on the page: nothing to collect.
        return
    for td in bs.tbody.find_all("td"):
        if not isinstance(td, bs4.element.Tag):
            continue
        for a in td.find_all("a"):
            link = a.get('href')
            if link is None:
                # Anchors without a target cannot be downloaded; skip them so
                # both lists stay aligned.
                continue
            list_href.append(link)
            list_title.append(a.text)


def showAll(list):
    """Print every element of ``list`` on its own line."""
    for univ in list:
        print(univ)


def readPDF(pdfFile):
    """Extract the text of an open PDF file object.

    Args:
        pdfFile: Binary file-like object containing the PDF.

    Returns:
        The extracted text as a single string.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        process_pdf(rsrcmgr, device, pdfFile)
        return retstr.getvalue()
    finally:
        # Release the converter and buffer even when parsing fails.
        device.close()
        retstr.close()


def _extract_abstract(text):
    """Return the abstract section of ``text``, or None if there is no "Abstract"."""
    start = text.find("Abstract")
    if start == -1:
        return None
    for marker in _INTRO_MARKERS:
        # Search only after the abstract so an earlier occurrence of the
        # marker cannot produce an empty slice.
        end = text.find(marker, start)
        if end != -1:
            return text[start:end]
    return text[start:start + _ABSTRACT_FALLBACK_LEN]


def _process_paper(title, href):
    """Download one PDF, extract abstract and keywords, and store the record."""
    # The response is closed as soon as the text is extracted, whether or not
    # an abstract is found.
    with urlopen(href) as pdfFile:
        outputString = readPDF(pdfFile)
    document = _extract_abstract(outputString)
    if document is None:
        return
    keywords = HanLP.extractKeyword(document, 10)
    print(keywords)
    # Each keyword is followed by one space, matching the stored format.
    keyword_text = "".join(str(k) + " " for k in keywords)
    insertInformation(title, document, keyword_text, href)


def main():
    """Crawl the index page and process every linked paper."""
    url = "https://blog.csdn.net/u014636245/article/details/91426736"
    try:
        html = getHtmlText(url)
        getDataFromHtml(list_href, html)
    except Exception as e:
        print(e)
        print("爬取失败")
        return
    for i, (title, href) in enumerate(zip(list_title, list_href)):
        print(i)
        try:
            _process_paper(title, href)
        except Exception as e:
            # A single broken PDF must not abort the rest of the crawl.
            print(e)
            print("爬取失败")
        time.sleep(0.1)


if __name__ == '__main__':
    main()
结果:
结果有很多条记录,每条记录的 keywords 字段里包含 10 个关键词;
然后就是将他们从数据库中取出来放在数组中,然后再进行排序,找最大;
不要忘记将介词等无用词去掉;
排序最简单的方法是使用 Map:
// 排序 List<Map.Entry<String ,Integer>> list = new ArrayList<Map.Entry<String,Integer>>(map.entrySet()); //在java中,如果要对集合对象或数组对象进行排序,需要实现Comparator接口以达到我们想要的目标 Comparator<Map.Entry<String,Integer>> comparator = new Comparator<Map.Entry<String, Integer>>() { public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) { return (left.getValue().compareTo(right.getValue())); } }; // 集合默认升序升序 Collections.sort(list,comparator); String ten[]=new String[50]; int shu[]=new int[50]; for(int i=0;i<50;i++){// 由高到低输出 ten[i]=list.get(list.size()-i-1).getKey(); shu[i]=list.get(list.size()-i-1).getValue(); Tu tu =new Tu(); tu.name=ten[i]; tu.value=shu[i]; list_tu.add(tu); System.out.println(list.get(list.size()-i-1).getKey() +":"+list.get(list.size()-i-1).getValue()); }
然后设置一个点击事件,并将结果转换成 JSON 格式,代码如下:
// Declare the body as UTF-8 JSON before the writer is obtained; the charset
// only takes effect if it is set before getWriter() is called.
response.setContentType("application/json;charset=UTF-8");
// Serialize the word list and send it as the response body.
Gson gson = new Gson();
String json = gson.toJson(list_tu);
response.getWriter().write(json);
然后使用echarts设计热词云
<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<%-- Renders the hot-word cloud and, below it, the list of papers in the "list" attribute. --%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Insert title here</title>
<link rel="stylesheet" href="css/bootstrap.min.css" type="text/css" />
<script src="js/jquery-1.11.3.min.js" type="text/javascript"></script>
<script type="text/javascript" src="js/echarts.min.js"></script>
<script type="text/javascript" src="js/china.js"></script>
<script src="js/bootstrap.min.js" type="text/javascript"></script>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src='js/echarts-wordcloud.js'></script>
</head>
<body>
    <div id="main" style="width: 100%;height: 400px"></div>
    <div>
        <table class="table" style="width: 100%;align-content: center;">
            <tr>
                <th align="center">论文连接</th>
            </tr>
            <c:forEach var="item" items="${list}">
                <tr>
                    <%-- c:out escapes the crawled title and link so they cannot inject markup. --%>
                    <td><a href="<c:out value='${item.lianjie}'/>"><c:out value="${item.title}"/></a></td>
                </tr>
            </c:forEach>
        </table>
    </div>
    <script>
        var chart = echarts.init(document.getElementById('main'));

        // Build the word-cloud option from the server's [{name, value}, ...] list.
        function renderWordCloud(words) {
            var mydata = [];
            for (var i = 0; i < words.length; i++) {
                mydata.push({
                    name : words[i].name,
                    value : words[i].value
                });
            }
            var option = {
                tooltip : {},
                series : [ {
                    type : 'wordCloud',
                    gridSize : 2,
                    sizeRange : [ 20, 50 ],
                    rotationRange : [ -90, 90 ],
                    shape : 'pentagon',
                    width : 600,
                    height : 300,
                    drawOutOfBound : true,
                    textStyle : {
                        normal : {
                            // Random dark colour for every word.
                            color : function() {
                                return 'rgb(' + [
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160)
                                ].join(',') + ')';
                            }
                        },
                        emphasis : {
                            shadowBlur : 10,
                            shadowColor : '#333'
                        }
                    },
                    data : mydata
                } ]
            };
            chart.setOption(option);
        }

        // Load the words asynchronously and render only on success, so a
        // failed request no longer leads to reading an undefined result.
        $.ajax({
            url : "PaperServlet_",
            type : "POST",
            dataType : "json",
            success : function(data) {
                renderWordCloud(data || []);
            },
            error : function() {
                alert("请求失败");
            }
        });

        // Clicking a word opens the list of papers for that word.
        chart.on('click', function(params) {
            // Encode the word so spaces and special characters survive the query string.
            window.location.href = "ClickServlet?geunjian=" + encodeURIComponent(params.name);
        });
        // Call resize through the chart so it runs with the chart as receiver.
        window.onresize = function() {
            chart.resize();
        };
    </script>
</body>
</html>
然后点击热词后携带此热词到servlet,再从数据库中找出论文的关键字中包含此热词的论文列表
import java.io.IOException; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import com.me.dao.LWDao; import com.me.domain.LunWen; /** * Servlet implementation class ClickServlet */ @WebServlet("/ClickServlet") public class ClickServlet extends HttpServlet { private static final long serialVersionUID = 1L; LWDao dao = new LWDao(); public ClickServlet() { super(); // TODO Auto-generated constructor stub } protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { String geunjian = request.getParameter("geunjian"); System.out.println(geunjian); List<LunWen> guan = new ArrayList<LunWen>(); try { guan = dao.login(geunjian); } catch (SQLException e) { e.printStackTrace(); } for(int i=0;i<guan.size();i++) { if(guan.get(i).getLianjie()!=null) { String ss = guan.get(i).getLianjie().substring(6,guan.get(i).getLianjie().length()); guan.get(i).setLianjie("http://openaccess.thecvf.com/"+ss); } } request.setAttribute("list", guan); System.out.println(guan.size()); request.getRequestDispatcher("lw.jsp").forward(request, response); } /** * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) */ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub doGet(request, response); } }