zoukankan      html  css  js  c++  java
  • 热词云

    我的队友是徐姣美 这是她的博客  https://home.cnblogs.com/u/xjmm/

    首先从网站上爬取论文的相关信息,数据来源是这个页面: https://blog.csdn.net/u014636245/article/details/91426736

    爬取部分我使用的是 Python,下面是我的代码

      1 import requests
      2 from bs4 import BeautifulSoup
      3 import bs4
      4 # -*- coding: UTF-8 -*
      5 from urllib.request import urlopen
      6 from pdfminer.pdfinterp import PDFResourceManager, process_pdf
      7 from pdfminer.converter import TextConverter
      8 from pdfminer.layout import LAParams
      9 from io import StringIO
     10 from pyhanlp import *
     11 import time
     12 
     13 import requests
     14 import json
     15 from pymysql import *
     16 
     17 #连接数据库的方法
     18 def connectDB():
     19     try:
     20         db=connect(host='localhost',port=3306,user='root',password='123456',db='python')
     21         print("数据库连接成功")
     22         return db
     23     except Exception as e:
     24         print(e)
     25     return NULL
     26 
     27 db = connectDB()
     28 
     29 #向数据库中插入数据的方法
     30 def insertInformation(title,abstract,keywords,href):
     31     cursor=db.cursor()
     32     try:
     33         cursor.execute("insert into new_table(title,abstract,keywords,href) values('%s','%s','%s','%s')" % (title,abstract,keywords,href))
     34         print("插入成功")
     35         db.commit()
     36         cursor.close()
     37         return True
     38     except Exception as e:
     39         print(e)
     40         db.rollback()
     41     return False
     42 
    # Parallel lists filled by getDataFromHtml: list_href[i] is the link target
    # of the anchor whose text is list_title[i].
    list_href=[]
    list_title=[]
     45 
     46 def getHtmlText(url):
     47     r = requests.get(url)
     48     r.raise_for_status()
     49     r.encoding = r.apparent_encoding
     50     html = r.text
     51     return html
     52 
     53 
     54 
     55 def getDataFromHtml(list,html):
     56     bs = BeautifulSoup(html, "lxml")
     57     for td in bs.tbody.find_all("td"):
     58         if isinstance(td,bs4.element.Tag):
     59             for a in td.find_all("a"):
     60                 list_href.append(a['href'])
     61                 list_title.append(a.text)
     62 
     63 def showAll(list):
     64     for univ in list:
     65         print(univ)
     66 
     67 
     68 def readPDF(pdfFile):
     69     rsrcmgr = PDFResourceManager()
     70     retstr = StringIO()
     71     laparams = LAParams()
     72     device = TextConverter(rsrcmgr, retstr, laparams=laparams)
     73     process_pdf(rsrcmgr, device, pdfFile)
     74     device.close()
     75     content = retstr.getvalue()
     76     retstr.close()
     77     return content
     78 
     79 if __name__ == '__main__':
     80     url = "https://blog.csdn.net/u014636245/article/details/91426736"
     81     try:
     82         html = getHtmlText(url)
     83         getDataFromHtml(list,html)
     84         for i in range(0,len(list_title)):
     85             print(i)
     86             pdfFile = urlopen(list_href[i])
     87             # 远程
     88             outputString = readPDF(pdfFile)
     89             if "Abstract" in outputString:
     90                 document = ""
     91                 if "1. Introduction" in outputString and "Abstract" in outputString:
     92                     document = outputString[outputString.index("Abstract"):outputString.index("1. Introduction")]
     93                 elif "1.Introduction" in outputString and "Abstract" in outputString:
     94                     document = outputString[outputString.index("Abstract"):outputString.index("1.Introduction")]
     95                 else :
     96                     document = outputString[outputString.index("Abstract"):outputString.index("Abstract")+800]
     97                 # print(document)
     98                 keywords = HanLP.extractKeyword(document, 10)
     99                 print(keywords)
    100                 str = ""
    101                 for k in keywords:
    102                     str+=k+" "
    103                 pdfFile.close()
    104                 insertInformation(list_title[i],document,str,list_href[i])
    105             time.sleep(0.1)
    106     except Exception as e:
    107         print(e)
    108         print("爬取失败")
    py

    然后爬取结束后是这个样子

    数据有很多条,每条记录的 keywords 字段里有 10 个关键词;

    然后就是将他们从数据库中取出来放在数组中,然后再进行排序,找最大;

    不要忘记将介词等无用词去掉;

    进行排序最简单的方法是使用 Map:

    // Sort the map entries by value.
    // NOTE(review): presumably keyword -> occurrence count; confirm against the code that fills `map`.
    List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
    // Sorting a collection with a custom order requires a Comparator.
    Comparator<Map.Entry<String, Integer>> comparator = new Comparator<Map.Entry<String, Integer>>() {
        public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
            return left.getValue().compareTo(right.getValue());
        }
    };
    // Ascending order: the largest values end up at the tail of the list.
    Collections.sort(list, comparator);
    String ten[] = new String[50];
    int shu[] = new int[50];
    // Stop at the list size. The original always looped 50 times and threw
    // IndexOutOfBoundsException when fewer than 50 entries existed.
    int top = Math.min(ten.length, list.size());
    for (int i = 0; i < top; i++) { // walk from the tail: highest value first
        Map.Entry<String, Integer> entry = list.get(list.size() - i - 1);
        ten[i] = entry.getKey();
        shu[i] = entry.getValue();

        Tu tu = new Tu();
        tu.name = ten[i];
        tu.value = shu[i];
        list_tu.add(tu);
        System.out.println(entry.getKey() + ":" + entry.getValue());
    }

    然后设置一个点击事件,转换成json的代码形式

    // Serialize list_tu to JSON and write it as the response body.
    // Set the content type and charset before getWriter(): the writer's
    // encoding is fixed once it has been obtained.
    response.setContentType("application/json;charset=UTF-8");
    Gson gson = new Gson();
    String json = gson.toJson(list_tu);
    response.getWriter().write(json);

    然后使用echarts设计热词云

     1 <%@ page language="java" contentType="text/html; charset=UTF-8"
     2     pageEncoding="UTF-8"%>
     3 <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
     4 <!DOCTYPE html>
     5 <html>
     6 <head>
     7 <meta charset="UTF-8">
     8 <title>Insert title here</title>
     9 <link rel="stylesheet" href="css/bootstrap.min.css" type="text/css" />
    10 <script src="js/jquery-1.11.3.min.js" type="text/javascript"></script>
    11 <script type="text/javascript" src="js/echarts.min.js"></script>
    12 <script type="text/javascript" src="js/china.js"></script>
    13 <script src="js/bootstrap.min.js" type="text/javascript"></script>
    14 <script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
    15 <script src='js/echarts-wordcloud.js'></script>
    16 </head>
    17 <body>
    18 <div id="main" style=" 100%;height: 400px"></div>
    19 <div>
    20   <table class="table" style=" 100%;align-content: center;" >
    21     <tr>
    22       <th align="center">论文连接</th>
    23     </tr>
    24     <c:forEach var="item" items="${list}">
    25       <tr>
    26         <td><a href="${item.lianjie }">${item.title}</a></td>
    27       </tr>
    28     </c:forEach>
    29   </table>
    30 </div>
    31 <script>
    32   var chart = echarts.init(document.getElementById('main'));
    33   var dt;
    34   $.ajax({
    35     url : "PaperServlet_",
    36     async : false,
    37     type : "POST",
    38     success : function(data) {
    39       dt = data;
    40      // alert(dt[0].title);
    41     },
    42     error : function() {
    43       alert("请求失败");
    44     },
    45     dataType : "json"
    46   });
    47   var mydata = new Array(0);
    48   for (var i = 0; i < dt.length; i++) {
    49       var d = {};
    50       
    51       d["name"] = dt[i].name;
    52       //alert(dt[i].name);
    53       d["value"] = dt[i].value;
    54       mydata.push(d);
    55   }
    56   var option = {
    57     tooltip: {},
    58     series: [ {
    59       type: 'wordCloud',
    60       gridSize: 2,
    61       sizeRange: [20, 50],
    62       rotationRange: [-90, 90],
    63       shape: 'pentagon',
    64        600,
    65       height: 300,
    66       drawOutOfBound: true,
    67       textStyle: {
    68         normal: {
    69           color: function () {
    70             return 'rgb(' + [
    71               Math.round(Math.random() * 160),
    72               Math.round(Math.random() * 160),
    73               Math.round(Math.random() * 160)
    74             ].join(',') + ')';
    75           }
    76         },
    77         emphasis: {
    78           shadowBlur: 10,
    79           shadowColor: '#333'
    80         }
    81       },
    82       data: mydata
    83     } ]
    84   };
    85 
    86   chart.setOption(option);
    87   chart.on('click', function (params) {
    88       var url = "ClickServlet?geunjian=" + params.name;
    89       window.location.href = url;
    90     });
    91   window.onresize = chart.resize;
    92 </script>
    93 </body>
    94 </html>
    View Code

    然后点击热词后携带此热词到servlet,再从数据库中找出论文的关键字中包含此热词的论文列表

     1 import java.io.IOException;
     2 import java.sql.SQLException;
     3 import java.util.ArrayList;
     4 import java.util.List;
     5 
     6 import javax.servlet.ServletException;
     7 import javax.servlet.annotation.WebServlet;
     8 import javax.servlet.http.HttpServlet;
     9 import javax.servlet.http.HttpServletRequest;
    10 import javax.servlet.http.HttpServletResponse;
    11 
    12 import com.me.dao.LWDao;
    13 import com.me.domain.LunWen;
    14 
    15 /**
    16  * Servlet implementation class ClickServlet
    17  */
    18 @WebServlet("/ClickServlet")
    19 public class ClickServlet extends HttpServlet {
    20     private static final long serialVersionUID = 1L;
    21     LWDao dao = new LWDao();
    22     
    23     public ClickServlet() {
    24         super();
    25         // TODO Auto-generated constructor stub
    26     }
    27 
    28     protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    29         String geunjian = request.getParameter("geunjian");
    30         System.out.println(geunjian);
    31         List<LunWen> guan = new ArrayList<LunWen>();
    32         try {
    33             guan = dao.login(geunjian);
    34         } catch (SQLException e) {
    35             e.printStackTrace();
    36         }
    37         for(int i=0;i<guan.size();i++) {
    38             if(guan.get(i).getLianjie()!=null) {
    39                 String ss = guan.get(i).getLianjie().substring(6,guan.get(i).getLianjie().length());
    40                 guan.get(i).setLianjie("http://openaccess.thecvf.com/"+ss);
    41             }
    42             
    43         }
    44         request.setAttribute("list", guan);
    45         System.out.println(guan.size());
    46         request.getRequestDispatcher("lw.jsp").forward(request, response);
    47     }
    48 
    49     /**
    50      * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
    51      */
    52     protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    53         // TODO Auto-generated method stub
    54         doGet(request, response);
    55     }
    56 
    57 }
    View Code

    然后嘞,运行一下就可以了

    大佬的博客写的非常的详细可以参考一哈  https://www.cnblogs.com/20183544-wangzhengshuai/p/12702137.html

  • 相关阅读:
    使用grep搜索多个字符串
    Linux中如何启用root用户
    Docker Image 的发布和 Container 端口映射
    IIS负载均衡
    IIS负载均衡ARR前端请求到本地服务器和后端处理服务器
    IIS http重定向https,强制用户使用https访问的配置方法-iis设置
    IIS中应用Application Request Route 配置负载均衡
    IIS配置HTTPSIIS配置HTTPS
    asp.net用户登入验证
    高频交易建模
  • 原文地址:https://www.cnblogs.com/1234yyf/p/12715824.html
Copyright © 2011-2022 走看看