zoukankan      html  css  js  c++  java
  • 过滤敏感字DFA JAVA实现

    package com.member.schedule;
    
    import java.io.UnsupportedEncodingException;
    import java.nio.ByteBuffer;
    import java.nio.charset.Charset;
    import java.util.List;
    
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    /**
     * @author 徐良永
     * @Date 2011-10-13 上午9:23:43
     */
    public class DFA {
    
        private static final Logger logger = LoggerFactory.getLogger(DFA.class);
    
        /**
         * 根节点
         */
        private TreeNode rootNode = new TreeNode();
    
        /**
         * 关键词缓存
         */
        private ByteBuffer keywordBuffer = ByteBuffer.allocate(1024);
    
        /**
         * 关键词编码
         */
        private String charset = "GBK";
    
        /**
         * 创建DFA
         * 
         * @param keywordList
         * @throws UnsupportedEncodingException
         */
        public void createKeywordTree(List<String> keywordList) throws UnsupportedEncodingException {
    
            for (String keyword : keywordList) {
                if (keyword == null)
                    continue;
                keyword = keyword.trim();
                byte[] bytes = keyword.getBytes(charset);
    
                TreeNode tempNode = rootNode;
                // 循环每个字节
                for (int i = 0; i < bytes.length; i++) {
                    int index = bytes[i] & 0xff; // 字符转换成数字
                    TreeNode node = tempNode.getSubNode(index);
    
                    if (node == null) { // 没初始化
                        node = new TreeNode();
                        tempNode.setSubNode(index, node);
                    }
    
                    tempNode = node;
    
                    if (i == bytes.length - 1) {
                        tempNode.setKeywordEnd(true); // 关键词结束, 设置结束标志
                        logger.debug("DFA:{}", keyword);
                    }
                } // end for
            } // end for
    
        }
    
        /**
         * 搜索关键字
         */
        public String searchKeyword(String text) throws UnsupportedEncodingException {
            return searchKeyword(text.getBytes(charset));
        }
    
        /**
         * 搜索关键字
         */
        public String searchKeyword(byte[] bytes) {
            StringBuilder words = new StringBuilder();
    
            if (bytes == null || bytes.length == 0) {
                return words.toString();
            }
    
            TreeNode tempNode = rootNode;
            int rollback = 0; // 回滚数
            int position = 0; // 当前比较的位置
    
            while (position < bytes.length) {
                int index = bytes[position] & 0xFF;
                keywordBuffer.put(bytes[position]); // 写关键词缓存
                tempNode = tempNode.getSubNode(index);
    
                // 当前位置的匹配结束
                if (tempNode == null) {
                    position = position - rollback; // 回退 并测试下一个字节
                    rollback = 0;
                    tempNode = rootNode; // 状态机复位
                    keywordBuffer.clear(); // 清空
                } else if (tempNode.isKeywordEnd()) { // 是结束点 记录关键词
                    keywordBuffer.flip();
                    String keyword = Charset.forName(charset).decode(keywordBuffer).toString();
                    logger.debug("Find keyword:{}", keyword);
                    keywordBuffer.limit(keywordBuffer.capacity());
    
                    if (words.length() == 0)
                        words.append(keyword);
                    else
                        words.append(":").append(keyword);
    
                    rollback = 1; // 遇到结束点 rollback 置为1
                } else {
                    rollback++; // 非结束点 回退数加1
                }
    
                position++;
            }
    
            return words.toString();
        }
    
        public void setCharset(String charset) {
            this.charset = charset;
        }
    
    }
    package com.member.schedule;
    
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * 树节点 每个节点包含一个长度为256的数组
     * 
     * @author 徐良永
     * @Date 2011-10-12 上午3:11:24
     */
    public class TreeNode {
    
        private static final int NODE_LEN = 256;
    
        /**
         * true 关键词的终结 ; false 继续
         */
        private boolean end = false;
    
        private List<TreeNode> subNodes = new ArrayList<TreeNode>(NODE_LEN);
    
        public TreeNode() {
            for (int i = 0; i < NODE_LEN; i++) {
                subNodes.add(i, null);
            }
        }
    
        /**
         * 向指定位置添加节点树
         * 
         * @param index
         * @param node
         */
        public void setSubNode(int index, TreeNode node) {
            subNodes.set(index, node);
        }
    
        public TreeNode getSubNode(int index) {
            return subNodes.get(index);
        }
    
        public boolean isKeywordEnd() {
            return end;
        }
    
        public void setKeywordEnd(boolean end) {
            this.end = end;
        }
    
    }

    link:http://www.iteye.com/topic/1116520

  • 相关阅读:
    杭电 1521 排列组合

    杭电 1799 循环多少次?
    杭电1028 Ignatius and the Princess III(整数拆分)
    毕业论文A.1 Matlab环境配置
    Open Graphics Library初步_搭建环境_GLUT
    Open Graphics Library初步_摄影机_GLUT
    C#使用office相关com组件
    插入排序
    二叉树的四种遍历方法(递归、迭代)
  • 原文地址:https://www.cnblogs.com/jpit/p/7351903.html
Copyright © 2011-2022 走看看