ikanalyzer word segmentation: ranking the segmented words by information entropy

A project requirement called for a word-segmentation interface, so I am writing it down here.
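
The idea behind the entropy ranking, stated in my own notation (the original post only gives the code): for every candidate word w produced by the segmenter, collect the words that appear immediately to its left and to its right across the whole text, and score w by the sum of its left-neighbor and right-neighbor entropies:

    H(w)       = H_left(w) + H_right(w)
    H_left(w)  = - Σ_l p(l | w) · ln p(l | w)    (l ranges over the distinct left neighbors of w;
                                                  p(l | w) is the fraction of occurrences of w preceded by l)
    H_right(w) = - Σ_r p(r | w) · ln p(r | w)    (defined symmetrically for the right neighbors r)

A word whose neighbors vary widely on both sides behaves like an independent term and is ranked higher. Note that the code below uses the natural logarithm (Math.log).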

1. Required dependency:

    <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
    <dependency>
        <groupId>com.janeluo</groupId>
        <artifactId>ikanalyzer</artifactId>
        <version>2012_u6</version>
    </dependency>
Maven dependency

2. The complete code is as follows:

    // Imports are not shown in the original post; this code assumes a JSON library whose
    // JSONArray implements java.util.List (e.g. Alibaba fastjson), Guava's Lists, and
    // IKAnalyzer's org.wltea.analyzer.core.IKSegmenter / Lexeme.
    public JSONArray entropy(String content, Integer quantity) throws Exception {
        // DelHtmlTagUtil.delHTMLTag strips HTML tags from the input; see the note after the listing.
        List<String> words = extract(DelHtmlTagUtil.delHTMLTag(content), quantity);
        return calculateWordEntropy(words);
    }

    /**
     * Takes the article as a String, segments it with IKAnalyzer (smart mode)
     * and collects the resulting words into a list.
     *
     * @param content  the text to segment
     * @param quantity only keep words whose length is greater than this value; defaults to 1
     * @return the list of extracted words
     */
    private List<String> extract(String content, Integer quantity) throws IOException {
        if (quantity == null) {
            quantity = 1; // documented default
        }
        List<String> list = Lists.newArrayList();
        StringReader reader = new StringReader(content);
        IKSegmenter ik = new IKSegmenter(reader, true); // true = smart segmentation mode
        Lexeme lex;
        while ((lex = ik.next()) != null) {
            // String typeString = lex.getLexemeTypeString();  // lexeme type, if needed
            String word = lex.getLexemeText();
            if (word.length() > quantity) { // keep only words longer than the threshold
                list.add(word);
            }
        }
        return list;
    }

    private JSONArray calculateWordEntropy(List<String> words) throws Exception {

        int length = words.size();
        ArrayList<String[]> wordList = new ArrayList<String[]>();
        // Store the segmented words as overlapping triples: [left neighbor, word, right neighbor].
        // "null" marks a missing neighbor at the start or end of the text.
        for (int i = 0; i < length; i++) {
            String[] wordSeg = new String[3];
            wordSeg[0] = (i == 0) ? "null" : words.get(i - 1);
            wordSeg[1] = words.get(i);
            wordSeg[2] = (i == length - 1) ? "null" : words.get(i + 1);
            wordList.add(wordSeg);
        }
        // Remove duplicate words.
        List<String> lists = Lists.newArrayList();
        for (int l = 0; l < length; l++) {
            lists.add(words.get(l));
        }
        List<String> tempList = Lists.newArrayList();
        for (String str : lists) {
            if (!(tempList.contains(str))) {
                tempList.add(str);
            }
        }
        String[] wordClean = new String[tempList.size()];
        for (int m = 0; m < tempList.size(); m++) {
            wordClean[m] = tempList.get(m);
        }
        // Count the frequency of each distinct word (not used by the entropy score below).
        int[] frequent = new int[wordClean.length];
        for (int j = 0; j < wordClean.length; j++) {
            int count = 0;
            for (int k = 0; k < words.size(); k++) {
                if (wordClean[j].equals(words.get(k))) {
                    count++;
                }
            }
            frequent[j] = count;
        }
        // For each distinct word, collect every triple whose middle element is that word,
        // then compute the entropy of its left and right neighbors.
        double[] allEntropy = new double[wordClean.length];
        for (int n = 0; n < wordClean.length; n++) {
            ArrayList<String[]> wordSegList = new ArrayList<String[]>();
            int count = 0; // occurrences of wordClean[n]; the original started at 1, which skews the probabilities
            for (int p = 0; p < wordList.size(); p++) {
                String[] wordSegStr = wordList.get(p);
                if (wordSegStr[1].equals(wordClean[n])) {
                    count++;
                    wordSegList.add(wordSegStr);
                }
            }
            String[] leftword = new String[wordSegList.size()];
            String[] rightword = new String[wordSegList.size()];
            // Collect the left neighbors.
            for (int i = 0; i < wordSegList.size(); i++) {
                String[] left = wordSegList.get(i);
                leftword[i] = left[0];
            }
            // Remove duplicate left neighbors.
            List<String> listsLeft = new ArrayList<String>();
            for (int l = 0; l < leftword.length; l++) {
                listsLeft.add(leftword[l]);
            }
            List<String> tempListLeft = new ArrayList<String>();
            for (String str : listsLeft) {
                if (!(tempListLeft.contains(str))) {
                    tempListLeft.add(str);
                }
            }
            String[] leftWordClean = new String[tempListLeft.size()];
            for (int m = 0; m < tempListLeft.size(); m++) {
                leftWordClean[m] = tempListLeft.get(m);
            }
            // Count the frequency of each distinct left neighbor.
            int[] leftFrequent = new int[leftWordClean.length];
            for (int j = 0; j < leftWordClean.length; j++) {
                int leftcount = 0;
                for (int k = 0; k < leftword.length; k++) {
                    if (leftWordClean[j].equals(leftword[k])) {
                        leftcount++;
                    }
                }
                leftFrequent[j] = leftcount;
            }
            // Left entropy: -sum(p * ln(p)) over the distinct left neighbors, with p = frequency / count.
            double leftEntropy = 0;
            for (int i = 0; i < leftFrequent.length; i++) {
                double a = (double) leftFrequent[i] / count;
                double b = Math.log((double) leftFrequent[i] / count);
                leftEntropy += -a * b;
            }
            // Collect the right neighbors.
            for (int i = 0; i < wordSegList.size(); i++) {
                String[] right = wordSegList.get(i);
                rightword[i] = right[2];
            }
            // Remove duplicate right neighbors.
            List<String> listsRight = new ArrayList<String>();
            for (int l = 0; l < rightword.length; l++) {
                listsRight.add(rightword[l]);
            }
            List<String> tempListRight = new ArrayList<String>();
            for (String str : listsRight) {
                if (!(tempListRight.contains(str))) {
                    tempListRight.add(str);
                }
            }
            String[] rightWordClean = new String[tempListRight.size()];
            for (int m = 0; m < tempListRight.size(); m++) {
                rightWordClean[m] = tempListRight.get(m);
            }
            // Count the frequency of each distinct right neighbor.
            int[] rightFrequent = new int[rightWordClean.length];
            for (int j = 0; j < rightWordClean.length; j++) {
                int rightcount = 0;
                for (int k = 0; k < rightword.length; k++) {
                    if (rightWordClean[j].equals(rightword[k])) {
                        rightcount++;
                    }
                }
                rightFrequent[j] = rightcount;
            }
            // Right entropy, computed the same way as the left entropy.
            double rightEntropy = 0.0;
            for (int i = 0; i < rightFrequent.length; i++) {
                double a = (double) rightFrequent[i] / count;
                double b = Math.log((double) rightFrequent[i] / count);
                rightEntropy += -a * b;
            }
            // The word's total score is the sum of its left and right entropy.
            allEntropy[n] = leftEntropy + rightEntropy;
        }
        // Build the result and sort it by entropy in descending order.
        JSONArray list = new JSONArray();
        for (int i = 0; i < allEntropy.length; i++) {
            JSONObject obj = new JSONObject();
            obj.put("name", wordClean[i]);
            obj.put("entropy", allEntropy[i]);
            list.add(obj);
        }
        Collections.sort(list, (o1, o2) -> {
            Double d1 = ((JSONObject) o1).getDouble("entropy");
            Double d2 = ((JSONObject) o2).getDouble("entropy");
            return d2.compareTo(d1);
        });

        return list;
    }
Processing code
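
The helper DelHtmlTagUtil.delHTMLTag called in entropy() is not included in the original post. A minimal sketch of such a utility, assuming all it needs to do is strip script/style blocks and HTML tags and collapse whitespace, might look like this (the class and method names follow the call site above; everything else is my assumption):

    public class DelHtmlTagUtil {

        /** Removes script/style blocks and HTML tags, then collapses whitespace. */
        public static String delHTMLTag(String htmlStr) {
            if (htmlStr == null) {
                return "";
            }
            String text = htmlStr
                    .replaceAll("(?is)<script[^>]*>.*?</script>", "")  // drop <script> blocks
                    .replaceAll("(?is)<style[^>]*>.*?</style>", "")    // drop <style> blocks
                    .replaceAll("(?s)<[^>]+>", "")                     // drop remaining tags
                    .replaceAll("&nbsp;", " ");                        // common HTML entity
            return text.replaceAll("\\s+", " ").trim();
        }
    }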
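
For completeness, a hypothetical caller might look like the following. The wrapping class name WordEntropyService and the sample text are my own; the original post does not show how the methods are invoked:

    public static void main(String[] args) throws Exception {
        // Hypothetical class holding the three methods from the listing above.
        WordEntropyService service = new WordEntropyService();
        String html = "<p>今天天气不错,大家一起去爬山,爬山是很好的运动,爬山也很累。</p>";
        JSONArray ranked = service.entropy(html, 1);
        // Words with the highest left+right neighbor entropy are printed first.
        for (int i = 0; i < ranked.size(); i++) {
            JSONObject item = ranked.getJSONObject(i);
            System.out.println(item.getString("name") + " -> " + item.getDouble("entropy"));
        }
    }
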
Original post: https://www.cnblogs.com/wiseroll/p/9360279.html