zoukankan      html  css  js  c++  java
  • 使用DFA算法对敏感词进行过滤

    项目目录结构如下:

    其中resources资源目录中:

    stopwd.txt:停顿词,匹配时直接过滤。

    wd.txt:敏感词库。

    1、WordFilter敏感词过滤类:

      1 package com.skyer.sensitivewdfilter;
      2 
      3 import java.io.BufferedReader;
      4 import java.io.IOException;
      5 import java.io.InputStreamReader;
      6 import java.util.ArrayList;
      7 import java.util.HashMap;
      8 import java.util.HashSet;
      9 import java.util.List;
     10 import java.util.Map;
     11 import java.util.Set;
     12 
     13 /**
     14  * 思路: 创建一个FilterSet,枚举了0~65535的所有char是否是某个敏感词开头的状态
     15  * 
     16  * 判断是否是 敏感词开头 | | 是 不是 获取头节点 OK--下一个字 然后逐级遍历,DFA算法
     17  */
     18 public class WordFilter {
     19 
     20     private static final FilterSet set = new FilterSet(); // 存储首字
     21     private static final Map<Integer, WordNode> nodes = new HashMap<Integer, WordNode>(1024, 1); // 存储节点
     22     private static final Set<Integer> stopwdSet = new HashSet<Integer>(); // 停顿词
     23     private static final char SIGN = '*'; // 敏感词过滤替换
     24 
     25     static {
     26         try {
     27             long a = System.nanoTime();
     28             init();
     29             a = System.nanoTime() - a;
     30             System.out.println("加载时间 : " + a + "ns");
     31             System.out.println("加载时间 : " + a / 1000000 + "ms");
     32         } catch (Exception e) {
     33             throw new RuntimeException("初始化过滤器失败");
     34         }
     35     }
     36 
     37     private static void init() {
     38         // 获取敏感词
     39         addSensitiveWord(readWordFromFile("wd.txt"));
     40         addStopWord(readWordFromFile("stopwd.txt"));
     41     }
     42 
     43     /**
     44      * 增加敏感词
     45      */
     46     private static List<String> readWordFromFile(String path) {
     47         List<String> words;
     48         BufferedReader br = null;
     49         try {
     50             br = new BufferedReader(new InputStreamReader(WordFilter.class.getClassLoader().getResourceAsStream(path)));
     51             words = new ArrayList<String>(1200);
     52             for (String buf = ""; (buf = br.readLine()) != null;) {
     53                 if (buf == null || buf.trim().equals(""))
     54                     continue;
     55                 words.add(buf);
     56             }
     57         } catch (Exception e) {
     58             throw new RuntimeException(e);
     59         } finally {
     60             try {
     61                 if (br != null)
     62                     br.close();
     63             } catch (IOException e) {
     64             }
     65         }
     66         return words;
     67     }
     68 
     69     /**
     70      * 增加停顿词
     71      */
     72     private static void addStopWord(final List<String> words) {
     73         if (words != null && words.size() > 0) {
     74             char[] chs;
     75             for (String curr : words) {
     76                 chs = curr.toCharArray();
     77                 for (char c : chs) {
     78                     stopwdSet.add(charConvert(c));
     79                 }
     80             }
     81         }
     82     }
     83 
     84     /**
     85      * 添加DFA节点
     86      */
     87     private static void addSensitiveWord(final List<String> words) {
     88         if (words != null && words.size() > 0) {
     89             char[] chs;
     90             int fchar;
     91             int lastIndex;
     92             WordNode fnode; // 首字母节点
     93             for (String curr : words) {
     94                 chs = curr.toCharArray();
     95                 fchar = charConvert(chs[0]);
     96                 if (!set.contains(fchar)) {// 没有首字定义
     97                     set.add(fchar);// 首字标志位 可重复add
     98                     fnode = new WordNode(fchar, chs.length == 1);
     99                     nodes.put(fchar, fnode);
    100                 } else {
    101                     fnode = nodes.get(fchar);
    102                     if (!fnode.isLast() && chs.length == 1)
    103                         fnode.setLast(true);
    104                 }
    105                 lastIndex = chs.length - 1;
    106                 for (int i = 1; i < chs.length; i++) {
    107                     fnode = fnode.addIfNoExist(charConvert(chs[i]), i == lastIndex);
    108                 }
    109             }
    110         }
    111     }
    112 
    113     /**
    114      * 过滤判断 将敏感词转化为成屏蔽词
    115      */
    116     public static final String doFilter(final String src) {
    117         char[] chs = src.toCharArray();
    118         int length = chs.length;
    119         int currc;
    120         int k;
    121         WordNode node;
    122         for (int i = 0; i < length; i++) {
    123             currc = charConvert(chs[i]);
    124             if (!set.contains(currc)) {
    125                 continue;
    126             }
    127             node = nodes.get(currc);
    128             if (node == null)
    129                 continue;
    130             boolean couldMark = false;
    131             int markNum = -1;
    132             if (node.isLast()) {
    133                 couldMark = true;
    134                 markNum = 0;
    135             }
    136             k = i;
    137             for (; ++k < length;) {
    138                 int temp = charConvert(chs[k]);
    139                 if (stopwdSet.contains(temp))
    140                     continue;
    141                 node = node.querySub(temp);
    142                 if (node == null)
    143                     break;
    144                 if (node.isLast()) {
    145                     couldMark = true;
    146                     markNum = k - i;
    147                 }
    148             }
    149             if (couldMark) {
    150                 for (k = 0; k <= markNum; k++) {
    151                     chs[k + i] = SIGN;
    152                 }
    153                 i = i + markNum;
    154             }
    155         }
    156 
    157         return new String(chs);
    158     }
    159 
    160     /**
    161      * 是否包含敏感词
    162      */
    163     public static final boolean isContains(final String src) {
    164         char[] chs = src.toCharArray();
    165         int length = chs.length;
    166         int currc;
    167         int k;
    168         WordNode node;
    169         for (int i = 0; i < length; i++) {
    170             currc = charConvert(chs[i]);
    171             if (!set.contains(currc)) {
    172                 continue;
    173             }
    174             node = nodes.get(currc);
    175             if (node == null)
    176                 continue;
    177             boolean couldMark = false;
    178             if (node.isLast()) {
    179                 couldMark = true;
    180             }
    181             k = i;
    182             for (; ++k < length;) {
    183                 int temp = charConvert(chs[k]);
    184                 if (stopwdSet.contains(temp))
    185                     continue;
    186                 node = node.querySub(temp);
    187                 if (node == null)
    188                     break;
    189                 if (node.isLast()) {
    190                     couldMark = true;
    191                 }
    192             }
    193             if (couldMark) {
    194                 return true;
    195             }
    196         }
    197 
    198         return false;
    199     }
    200 
    201     /**
    202      * 大写转化为小写 全角转化为半角
    203      */
    204     private static int charConvert(char src) {
    205         int r = BCConvert.qj2bj(src);
    206         return (r >= 'A' && r <= 'Z') ? r + 32 : r;
    207     }
    208 
    209 }
    WordFilter.java

    其中:

          isContains :是否包含敏感词

         doFilter:过滤敏感词

    2、WordNode敏感词节点:

     1 package com.skyer.sensitivewdfilter;
     2 
     3 import java.util.LinkedList;
     4 import java.util.List;
     5 
     /**
      * One node of the sensitive-word trie: an int value, an end-of-word flag, and a lazily
      * created list of child nodes.
      */
     public class WordNode {

         private int value; // value identifying this node among its siblings

         private List<WordNode> subNodes; // children; stays null until the first child is added

         private boolean isLast; // true when a word ends at this node (false by default)

         /**
          * Creates a node that does not end a word.
          *
          * @param value the node's value
          */
         public WordNode(int value) {
             this(value, false);
         }

         /**
          * Creates a node.
          *
          * @param value the node's value
          * @param isLast whether a word ends at this node
          */
         public WordNode(int value, boolean isLast) {
             this.value = value;
             this.isLast = isLast;
         }

         /**
          * Looks up the direct child carrying the given value.
          *
          * @return the child, or {@code null} if there is none
          */
         private WordNode findChild(final int wanted) {
             if (subNodes != null) {
                 for (WordNode child : subNodes) {
                     if (child.value == wanted) {
                         return child;
                     }
                 }
             }
             return null;
         }

         /**
          * Appends a child, creating the child list on first use.
          *
          * @return the node that was passed in
          */
         private WordNode addSubNode(final WordNode subNode) {
             if (subNodes == null) {
                 subNodes = new LinkedList<WordNode>();
             }
             subNodes.add(subNode);
             return subNode;
         }

         /**
          * Returns the child with the given value, creating and appending it if absent.
          * An existing child is promoted to end-of-word when {@code isLast} is {@code true};
          * it is never demoted.
          *
          * @param value value of the wanted child
          * @param isLast whether a word ends at that child
          * @return the existing or newly created child
          */
         public WordNode addIfNoExist(final int value, final boolean isLast) {
             final WordNode existing = findChild(value);
             if (existing == null) {
                 return addSubNode(new WordNode(value, isLast));
             }
             if (isLast) {
                 existing.isLast = true;
             }
             return existing;
         }

         /**
          * Finds the direct child with the given value.
          *
          * @param value value of the wanted child
          * @return the child, or {@code null} if this node has no such child
          */
         public WordNode querySub(final int value) {
             return findChild(value);
         }

         /**
          * @return whether a word ends at this node
          */
         public boolean isLast() {
             return isLast;
         }

         /**
          * @param isLast whether a word ends at this node
          */
         public void setLast(boolean isLast) {
             this.isLast = isLast;
         }

         /**
          * @return this node's value
          */
         @Override
         public int hashCode() {
             return value;
         }

     }
    WordNode.java

    3、测试类:

     1 package com.skyer.test;
     2 
     3 import org.junit.Test;
     4 
     5 import com.skyer.sensitivewdfilter.WordFilter;
     6 
     7 public class TestSensitivewd {
     8 
     9     @Test
    10     public void TestFilter() {
    11         String s = ""; // 这里写你要过滤的句子(我这里不能写,否则会给博客园屏蔽掉)
    12         System.out.println("解析问题: " + s);
    13         System.out.println("解析字数 : " + s.length());
    14         String re;
    15         long nano = System.nanoTime();
    16         re = WordFilter.doFilter(s);
    17         nano = (System.nanoTime() - nano);
    18         System.out.println("解析时间 : " + nano + "ns");
    19         System.out.println("解析时间 : " + nano / 1000000 + "ms");
    20         System.out.println(re);
    21         System.out.println();
    22 
    23         nano = System.nanoTime();
    24         System.out.println("是否包含敏感词: " + WordFilter.isContains(s));
    25         nano = (System.nanoTime() - nano);
    26         System.out.println("解析时间 : " + nano + "ns");
    27         System.out.println("解析时间 : " + nano / 1000000 + "ms");
    28     }
    29 
    30 }
    TestSensitivewd.java

    4、测试结果:

    原文参考:http://blog.csdn.net/fengshizty/article/details/52373005

    DFA知识:http://www.cnblogs.com/naaoveGIS/archive/2016/10/14/5960352.html

    作者:Oven
    个人网站:http://www.cloveaire.com
    个性签名:大亨以正,莫退初心!
    如果觉得这篇文章对你有帮助的话,记得在右下角点个“推荐”哦,博主在此感谢!
  • 相关阅读:
    共享一个从字符串转 Lambda 表达式的类(2)
    多个文件上传控件
    使用 SQL的 for xml path来进行字符串拼接
    数据结构之双向链表
    我的收藏颜色代码表
    C++中的字节对齐分析
    收藏sina播放器嵌入代码
    弃用数据库自增ID,曝光一下我自己用到的解决方法之终结篇
    google工作原理图
    easyicon一个非常好用的找图标的网站
  • 原文地址:https://www.cnblogs.com/Oven5217/p/6894695.html
Copyright © 2011-2022 走看看