zoukankan      html  css  js  c++  java
  • C# 敏感词过滤

        public class BadWordFilter

        {

            #region 变量

            private HashSet<string> hash = new HashSet<string>();

            private byte[] fastCheck = new byte[char.MaxValue];

            private byte[] fastLength = new byte[char.MaxValue];

            private BitArray charCheck = new BitArray(char.MaxValue);

            private BitArray endCheck = new BitArray(char.MaxValue);

            private int maxWordLength = 0;

            private int minWordLength = int.MaxValue;

            private string _replaceString = "*";

            private string _newWord;

            #endregion

     

            #region 单例模式创建实例

            private static BadWordFilter badWordFilter = null;

     

            /// <summary>

            /// 构造函数

            /// </summary>

            private BadWordFilter() { }

     

            /// <summary>

            /// 单例

            /// </summary>

            /// <returns></returns>

            public static BadWordFilter CreateBadWordsFilter()

            {

                if (badWordFilter == null)

                {

                    badWordFilter = new BadWordFilter();

                }

                return badWordFilter;

            }

            #endregion

     

            #region 初始化数据,将List集合类型敏感词放入HashSet中

            /// <summary>

            /// 初始化数据,将敏感词放入HashSet中

            /// </summary>

            /// <param name="badwords"></param>

            public void Init(List<BadWordEntity> badwords)

            {

                foreach (BadWordEntity word in badwords)

                {

                    maxWordLength = Math.Max(maxWordLength, word.BadWord.Length);

                    minWordLength = Math.Min(minWordLength, word.BadWord.Length);

                    for (int i = 0; i < 7 && i < word.BadWord.Length; i++)

                    {

                        fastCheck[word.BadWord[i]] |= (byte)(1 << i);

                    }

                    for (int i = 7; i < word.BadWord.Length; i++)

                    {

                        fastCheck[word.BadWord[i]] |= 0x80;

                    }

                    if (word.BadWord.Length == 1)

                    {

                        charCheck[word.BadWord[0]] = true;

                    }

                    else

                    {

                        fastLength[word.BadWord[0]] |= (byte)(1 << (Math.Min(7, word.BadWord.Length - 2)));

                        endCheck[word.BadWord[word.BadWord.Length - 1]] = true;

     

                        hash.Add(word.BadWord);

                    }

                }

            }

            #endregion

     

            #region 初始化数据,将String[]类型敏感词放入HashSet中

            /// <summary>

            /// 初始化数据,将敏感词放入HashSet中

            /// </summary>

            /// <param name="badwords"></param>

            private void Init(string[] badwords)

            {

                foreach (string word in badwords)

                {

                    maxWordLength = Math.Max(maxWordLength, word.Length);

                    minWordLength = Math.Min(minWordLength, word.Length);

                    for (int i = 0; i < 7 && i < word.Length; i++)

                    {

                        fastCheck[word[i]] |= (byte)(1 << i);

                    }

                    for (int i = 7; i < word.Length; i++)

                    {

                        fastCheck[word[i]] |= 0x80;

                    }

                    if (word.Length == 1)

                    {

                        charCheck[word[0]] = true;

                    }

                    else

                    {

                        fastLength[word[0]] |= (byte)(1 << (Math.Min(7, word.Length - 2)));

                        endCheck[word[word.Length - 1]] = true;

     

                        hash.Add(word);

                    }

                }

            }

            #endregion

     

            #region 检查是否有敏感词

            /// <summary>

            /// 检查是否有敏感词

            /// </summary>

            /// <param name="text"></param>

            /// <returns></returns>

            public bool HasBadWord(string text)

            {

                int index = 0;

                while (index < text.Length)

                {

                    int count = 1;

                    if (index > 0 || (fastCheck[text[index]] & 1) == 0)

                    {

                        while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ;

                    }

     

                    char begin = text[index];

                    if (minWordLength == 1 && charCheck[begin])

                    {

                        return true;

                    }

     

                    for (int j = 1; j <= Math.Min(maxWordLength, text.Length - index - 1); j++)

                    {

                        char current = text[index + j];

     

                        if ((fastCheck[current] & 1) == 0)

                        {

                            ++count;

                        }

     

                        if ((fastCheck[current] & (1 << Math.Min(j, 7))) == 0)

                        {

                            break;

                        }

     

                        if (j + 1 >= minWordLength)

                        {

                            if ((fastLength[begin] & (1 << Math.Min(j - 1, 7))) > 0 && endCheck[current])

                            {

                                string sub = text.Substring(index, j + 1);

     

                                if (hash.Contains(sub))

                                {

                                    return true;

                                }

                            }

                        }

                    }

     

                    index += count;

                }

     

                return false;

            }

            #endregion

     

            #region 替换敏感词

            /// <summary>

            /// 替换敏感词

            /// </summary>

            /// <param name="text"></param>

            /// <returns></returns>

            public string ReplaceBadWord(string text)

            {

                int index = 0;

     

                for (index = 0; index < text.Length; index++)

                {

                    if ((fastCheck[text[index]] & 1) == 0)

                    {

                        while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ;

                    }

     

                    //单字节检测

                    if (minWordLength == 1 && charCheck[text[index]])

                    {

                        text = text.Replace(text[index], _replaceString[0]);

                        continue;

                    }

                    //多字节检测

                    for (int j = 1; j <= Math.Min(maxWordLength, text.Length - index - 1); j++)

                    {

                        //快速排除

                        if ((fastCheck[text[index + j]] & (1 << Math.Min(j, 7))) == 0)

                        {

                            break;

                        }

     

                        if (j + 1 >= minWordLength)

                        {

                            string sub = text.Substring(index, j + 1);

     

                            if (hash.Contains(sub))

                            {

     

                                //替换字符操作

                                char cc = _replaceString[0];

                                string rp = _replaceString.PadRight((j + 1), cc);

                                text = text.Replace(sub, rp);

                                //记录新位置

                                index += j;

                                break;

                            }

                        }

                    }

                }

                _newWord = text;

                return text;

            }

            #endregion

        }

     

        #region 敏感词实体类

        /// <summary>

        /// 敏感词实体

        /// </summary>

        public class BadWordEntity

        {

            /// <summary>

            /// 敏感词

            /// </summary>

            public string BadWord { get; set; }

        }

        #endregion

  • 相关阅读:
    java基础语法
    MySQL5.7常用命令
    wireshark抓包分析---TCP/IP协议
    MySQL安全管理
    MySQL触发器
    MySQL存储过程和游标
    mysql实现远程登录
    Java中遍历Map对象的4种方法
    SSM-CRUD
    SSM整合-配置文件
  • 原文地址:https://www.cnblogs.com/liuyl/p/4276467.html
Copyright © 2011-2022 走看看