  • ikanalyzer word segmentation: ranking the results by information entropy

    A word-segmentation interface was needed for a project requirement, so the approach is recorded here: segment the text with ikanalyzer, then score each word by the information entropy of its left and right neighbouring words and sort the results in descending order.
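
    To make the scoring concrete: for each word w, the code in section 2 computes a left entropy H_left(w) = -Σ p(l)·ln p(l), where p(l) is the share of occurrences of w whose left neighbour is the word l, a right entropy H_right(w) defined symmetrically over right neighbours, and ranks words by H_left(w) + H_right(w). A quick example with made-up numbers: if a word occurs 4 times with left neighbours A, A, B, C, then H_left = -(0.5·ln 0.5 + 2 × 0.25·ln 0.25) ≈ 1.04; the more varied a word's neighbours are, the higher its entropy and the higher it ranks.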

    1. Required Maven dependency:

    <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
    <dependency>
        <groupId>com.janeluo</groupId>
        <artifactId>ikanalyzer</artifactId>
        <version>2012_u6</version>
    </dependency>
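
    As a quick sanity check that the dependency resolves, the sketch below (not from the original post) pushes a short sample sentence through IKSegmenter and prints each token. It relies only on the IKSegmenter/Lexeme calls that also appear in the full code in section 2; the org.wltea.analyzer.core package names are, as far as I know, the ones shipped by the 2012_u6 artifact.

    import java.io.IOException;
    import java.io.StringReader;

    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class IkSmokeTest {
        public static void main(String[] args) throws IOException {
            String text = "信息熵可以用来衡量一个词左右邻接词的多样性"; // made-up sample sentence
            IKSegmenter ik = new IKSegmenter(new StringReader(text), true); // true = smart mode
            Lexeme lex;
            while ((lex = ik.next()) != null) {
                System.out.println(lex.getLexemeText()); // print each segmented word
            }
        }
    }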

    2. Complete code:

    /*
     * Assumed imports (the original post does not show them):
     *   java.io.IOException, java.io.StringReader
     *   java.util.* (ArrayList, Collections, LinkedHashSet, List)
     *   com.google.common.collect.Lists (Guava)
     *   com.alibaba.fastjson.JSONArray / JSONObject (or another JSON library with the same API)
     *   org.wltea.analyzer.core.IKSegmenter / Lexeme (ikanalyzer 2012_u6)
     * DelHtmlTagUtil is the author's own utility for stripping HTML tags from the input.
     */
    public JSONArray entropy(String content, Integer quantity) throws Exception {
        List<String> words = extract(DelHtmlTagUtil.delHTMLTag(content), quantity);
        return calculateWordEntropy(words);
    }

    /**
     * Segments the given text with ikanalyzer and collects the words into a list.
     *
     * @param content  the text to segment
     * @param quantity keep only words longer than this many characters (default 1)
     * @return the list of extracted words
     */
    private List<String> extract(String content, Integer quantity) throws IOException {
        if (quantity == null) {
            quantity = 1;
        }
        List<String> list = Lists.newArrayList();
        StringReader reader = new StringReader(content);
        IKSegmenter ik = new IKSegmenter(reader, true); // true = smart segmentation mode
        Lexeme lex;
        while ((lex = ik.next()) != null) {
            // lex.getLexemeTypeString() would give the lexeme type if needed
            String word = lex.getLexemeText();
            if (word.length() > quantity) { // drop words that are too short
                list.add(word);
            }
        }
        return list;
    }

    private JSONArray calculateWordEntropy(List<String> words) throws Exception {
        int length = words.size();
        // Store every word as a triple of (previous word, word, next word);
        // "null" marks a missing neighbour at the start or end of the text.
        ArrayList<String[]> wordList = new ArrayList<>();
        for (int i = 0; i < length; i++) {
            String[] wordSeg = new String[3];
            wordSeg[0] = (i == 0) ? "null" : words.get(i - 1);
            wordSeg[1] = words.get(i);
            wordSeg[2] = (i == length - 1) ? "null" : words.get(i + 1);
            wordList.add(wordSeg);
        }
        // Remove duplicate words, keeping their first-seen order.
        List<String> tempList = new ArrayList<>(new LinkedHashSet<>(words));
        String[] wordClean = tempList.toArray(new String[0]);
        // Frequency of each distinct word (kept from the original post; not used below).
        int[] frequent = new int[wordClean.length];
        for (int j = 0; j < wordClean.length; j++) {
            frequent[j] = Collections.frequency(words, wordClean[j]);
        }
        // For each distinct word, gather the triples whose middle element is that word,
        // then compute the entropy of its left neighbours and of its right neighbours.
        double[] allEntropy = new double[wordClean.length];
        for (int n = 0; n < wordClean.length; n++) {
            ArrayList<String[]> wordSegList = new ArrayList<>();
            for (String[] wordSeg : wordList) {
                if (wordSeg[1].equals(wordClean[n])) {
                    wordSegList.add(wordSeg);
                }
            }
            int count = wordSegList.size(); // number of occurrences of the word
            List<String> leftword = new ArrayList<>();
            List<String> rightword = new ArrayList<>();
            for (String[] wordSeg : wordSegList) {
                leftword.add(wordSeg[0]);
                rightword.add(wordSeg[2]);
            }
            // Left entropy: H = -sum(p * ln p) over the distinct left neighbours,
            // where p is the share of occurrences with that left neighbour.
            double leftEntropy = 0.0;
            for (String left : new LinkedHashSet<>(leftword)) {
                double p = (double) Collections.frequency(leftword, left) / count;
                leftEntropy += -p * Math.log(p);
            }
            // Right entropy, computed the same way over the distinct right neighbours.
            double rightEntropy = 0.0;
            for (String right : new LinkedHashSet<>(rightword)) {
                double p = (double) Collections.frequency(rightword, right) / count;
                rightEntropy += -p * Math.log(p);
            }
            // Total score of the word = left entropy + right entropy.
            allEntropy[n] = leftEntropy + rightEntropy;
        }
        // Build the result and sort it by entropy in descending order.
        JSONArray list = new JSONArray();
        for (int i = 0; i < allEntropy.length; i++) {
            JSONObject obj = new JSONObject();
            obj.put("name", wordClean[i]);
            obj.put("entropy", allEntropy[i]);
            list.add(obj);
        }
        Collections.sort(list, (o1, o2) -> {
            Double d1 = ((JSONObject) o1).getDouble("entropy");
            Double d2 = ((JSONObject) o2).getDouble("entropy");
            return d2.compareTo(d1);
        });
        return list;
    }
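
    A minimal usage sketch, assuming the two methods above sit in a wrapper class (hypothetically named WordEntropyService here; the original post does not show the enclosing class) and that the JSON types come from fastjson:

    import com.alibaba.fastjson.JSONArray;
    import com.alibaba.fastjson.JSONObject;

    public class EntropyDemo {
        public static void main(String[] args) throws Exception {
            // WordEntropyService is a hypothetical wrapper class holding the two methods above.
            WordEntropyService service = new WordEntropyService();
            String article = "<p>任意一段待分词的文章内容</p>"; // any HTML or plain-text article
            JSONArray ranked = service.entropy(article, 1); // keep words longer than 1 character

            // Entries are already sorted by entropy in descending order; print the top 10.
            for (int i = 0; i < Math.min(10, ranked.size()); i++) {
                JSONObject item = ranked.getJSONObject(i);
                System.out.println(item.getString("name") + "\t" + item.getDouble("entropy"));
            }
        }
    }

    Words whose left and right neighbours vary a lot get a higher combined entropy, so the descending sort puts the most context-independent, keyword-like words first.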
  • Original post: https://www.cnblogs.com/wiseroll/p/9360279.html