zoukankan      html  css  js  c++  java
  • Web系统安全运营之基础——基于DFA算法的高性能敏感词、脏词检测过滤算法类(C#)

    【概述】做好一个Web系统的安全运维,除了常规的防注入、防入侵等措施之外,还有一项重要工作:检测并过滤敏感词、脏词。这件事做得不好,轻则导致一场投诉或纠纷,重则导致产品被勒令关闭停运。

     废话少说,先看下代码,可以拿过去直接使用。

      1 using Microsoft.VisualBasic;
      2 using System;
      3 using System.Collections.Generic;
      4 using System.IO;
      5 using System.Linq;
      6 using System.Text;
      7 
      8 namespace OpenCore.ContentSecurity
      9 {
     /// <summary>
     /// Detects and masks sensitive words in a single left-to-right pass over the input.
     /// The dictionary is held in a character trie, so the cost of a scan depends on the
     /// input length and the longest dictionary word, not on the number of dictionary words.
     /// Matching is case-insensitive, treats full-width characters as their half-width
     /// equivalents, and tolerates punctuation inserted between the characters of a word
     /// (for example "b-a-d" matches the dictionary word "bad").
     /// Reference consulted by the original author: https://blog.csdn.net/u011966339/article/details/72832197
     /// Change log:
     ///   2020-04-15: the dictionary is loaded once into static state instead of per call;
     ///               several dictionary files can be loaded; robustness improvements.
     /// </summary>
     public class SensitiveWordFilter
     {
         /// <summary>
         /// One node of the dictionary trie.
         /// </summary>
         private sealed class TrieNode
         {
             /// <summary>Transitions to the next node, keyed by normalised character.</summary>
             public readonly Dictionary<char, TrieNode> Children = new Dictionary<char, TrieNode>();

             /// <summary>True when the path from the root to this node spells a complete word.</summary>
             public bool IsWordEnd;
         }

         /// <summary>
         /// Dictionary files most recently passed to <see cref="Init"/>.
         /// </summary>
         private static string[] dictionaryPathList = null;

         /// <summary>
         /// In-memory dictionary (trie root). A reload builds a new trie and swaps this
         /// reference, so a scan that is already running keeps using the trie it started with.
         /// </summary>
         private static volatile TrieNode MEMORYLEXICON = new TrieNode();

         /// <summary>
         /// Serialises dictionary reloads.
         /// </summary>
         private static readonly object lockObj = new object();

         /// <summary>
         /// Loads the dictionary files into memory. Call once at start-up; calling it again
         /// replaces the whole dictionary.
         /// If the array is null/empty or any listed file does not exist, the problem is
         /// logged and the previously loaded dictionary is left unchanged.
         /// Exceptions raised while reading an existing file are not caught here.
         /// </summary>
         /// <param name="sDictionaryFileName">Paths of the dictionary files, one word per line.</param>
         public static void Init(string[] sDictionaryFileName)
         {
             lock (lockObj)
             {
                 dictionaryPathList = sDictionaryFileName;
                 LoadDictionary();
             }
         }

         public SensitiveWordFilter()
         {
         }

         private readonly List<string> illegalWords = new List<string>();

         /// <summary>
         /// Words found by the most recent <see cref="getDataByFilter"/> call on this instance,
         /// in order of appearance and exactly as they were written in the input.
         /// </summary>
         public List<string> IllegalWords
         {
             get { return illegalWords; }
         }

         /// <summary>
         /// Returns true for characters that count as word content: CJK ideographs in
         /// U+4E00..U+9FA5, ASCII digits and ASCII letters. Every other character is treated
         /// as noise that may be skipped between two characters of a word.
         /// </summary>
         private static bool IsWordChar(char character)
         {
             return (character >= 0x4e00 && character <= 0x9fa5)
                 || (character >= '0' && character <= '9')
                 || (character >= 'a' && character <= 'z')
                 || (character >= 'A' && character <= 'Z');
         }

         /// <summary>
         /// Converts full-width characters to half-width and lower-cases the result.
         /// The full-width space (12288) maps to the ASCII space (32); full-width 65281..65374
         /// map to ASCII 33..126 (a constant offset of 65248).
         /// The conversion is done per character, so the result always has the same length
         /// as the input and indexes line up between the two strings.
         /// </summary>
         /// <param name="input">Any string; null is treated as empty.</param>
         /// <returns>The half-width, lower-case form of <paramref name="input"/>.</returns>
         private static string ToDBC(string input)
         {
             if (string.IsNullOrEmpty(input))
             {
                 return string.Empty;
             }
             char[] c = input.ToCharArray();
             for (int i = 0; i < c.Length; i++)
             {
                 if (c[i] == 12288)
                 {
                     c[i] = (char)32;
                 }
                 else if (c[i] > 65280 && c[i] < 65375)
                 {
                     c[i] = (char)(c[i] - 65248);
                 }
                 // Invariant casing keeps matching independent of the thread culture.
                 c[i] = char.ToLowerInvariant(c[i]);
             }
             return new string(c);
         }

         /// <summary>
         /// Converts traditional Chinese to simplified Chinese. The conversion is best effort:
         /// if <c>Strings.StrConv</c> throws, the input is returned unchanged.
         /// </summary>
         /// <param name="sInput">Text to convert; null or empty yields an empty string.</param>
         /// <returns>The simplified form, or the input itself when conversion is unavailable.</returns>
         private static string ToSimplifiedChiniese(string sInput)
         {
             if (string.IsNullOrEmpty(sInput))
             {
                 return string.Empty;
             }
             try
             {
                 return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
             }
             catch (Exception)
             {
                 // Deliberate fallback: the simplified variant is only an extra dictionary
                 // entry, so failing to produce it must not stop the dictionary from loading.
                 return sInput;
             }
         }

         /// <summary>
         /// Appends a line to SecurityLog/yyyyMMdd.log under the application base directory.
         /// Logging is best effort: any failure is swallowed so that it can never break filtering.
         /// </summary>
         /// <param name="Msg">Message to record.</param>
         private static void SaveLog(string Msg)
         {
             try
             {
                 string sPath = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "SecurityLog");
                 // CreateDirectory is a no-op when the directory already exists.
                 Directory.CreateDirectory(sPath);
                 sPath = Path.Combine(sPath, DateTime.Now.ToString("yyyyMMdd") + ".log");
                 File.AppendAllText(sPath, "[" + DateTime.Now.ToString() + "]" + Msg + Environment.NewLine);
             }
             catch (Exception)
             {
                 // Intentionally ignored, see summary.
             }
         }

         /// <summary>
         /// Inserts one normalised word into the trie rooted at <paramref name="root"/>.
         /// Inserting the same word twice has no additional effect.
         /// </summary>
         private static void AddWord(TrieNode root, string word)
         {
             TrieNode node = root;
             foreach (char ch in word)
             {
                 TrieNode next;
                 if (!node.Children.TryGetValue(ch, out next))
                 {
                     next = new TrieNode();
                     node.Children.Add(ch, next);
                 }
                 node = next;
             }
             node.IsWordEnd = true;
         }

         /// <summary>
         /// Rebuilds the in-memory dictionary from <see cref="dictionaryPathList"/>.
         /// Each non-blank line is trimmed and normalised with <see cref="ToDBC"/>; when its
         /// simplified-Chinese form differs, that form is added as well.
         /// </summary>
         private static void LoadDictionary()
         {
             string[] paths = dictionaryPathList;
             if (paths == null || paths.Length == 0)
             {
                 SaveLog($"SensitiveWordFilter.LoadDictionary.字典路径配置为空");
                 return;
             }
             foreach (string sFileName in paths)
             {
                 if (!File.Exists(sFileName))
                 {
                     SaveLog($"SensitiveWordFilter.LoadDictionary.路径:{sFileName}不是一个有效的文件");
                     return;
                 }
             }

             // Build into a fresh trie; the live one is only replaced after a complete load.
             TrieNode root = new TrieNode();
             foreach (string sDictionaryFile in paths)
             {
                 // NOTE(review): Encoding.Default differs between runtimes and machines;
                 // confirm it matches the encoding the dictionary files were saved in.
                 foreach (string line in File.ReadAllLines(sDictionaryFile, Encoding.Default))
                 {
                     string key = ToDBC(line).Trim();
                     if (key.Length == 0)
                     {
                         continue;
                     }
                     AddWord(root, key);

                     // Also accept the simplified spelling of a traditional-Chinese entry.
                     string key_simple = ToSimplifiedChiniese(key);
                     if (key_simple != key)
                     {
                         AddWord(root, key_simple);
                     }
                 }
             }
             MEMORYLEXICON = root;
         }

         /// <summary>
         /// Starting from <paramref name="node"/> (which has already consumed some characters
         /// of a word), reads <paramref name="text"/> from <paramref name="cursor"/> and returns
         /// the exclusive end index of the longest dictionary word that can be completed,
         /// or -1 when none can.
         /// Noise characters (see <see cref="IsWordChar"/>) between word characters are skipped.
         /// A noise character that is also a literal character of some dictionary word is
         /// tried both ways. Noise after the last matched character is never part of the match.
         /// </summary>
         private static int FindLongestEnd(TrieNode node, string text, int cursor)
         {
             int best = -1;
             while (cursor < text.Length)
             {
                 char ch = text[cursor];
                 bool skippable = !IsWordChar(ch);
                 TrieNode child;
                 bool hasChild = node.Children.TryGetValue(ch, out child);

                 if (hasChild && skippable)
                 {
                     // Ambiguous: explore the literal reading on the side, then carry on
                     // below with the "skip it" reading. Recursion depth is bounded by the
                     // length of the longest dictionary word.
                     int literalEnd = Math.Max(child.IsWordEnd ? cursor + 1 : -1,
                                               FindLongestEnd(child, text, cursor + 1));
                     best = Math.Max(best, literalEnd);
                     cursor++;
                 }
                 else if (hasChild)
                 {
                     node = child;
                     cursor++;
                     if (node.IsWordEnd)
                     {
                         best = Math.Max(best, cursor);
                     }
                 }
                 else if (skippable)
                 {
                     cursor++;
                 }
                 else
                 {
                     break;
                 }
             }
             return best;
         }

         /// <summary>
         /// Returns the exclusive end index of the longest dictionary word that begins exactly
         /// at <paramref name="start"/>, or -1 when no word begins there. The first character
         /// must match literally; noise is only skipped inside a word.
         /// </summary>
         private static int MatchEnd(TrieNode root, string text, int start)
         {
             TrieNode first;
             if (!root.Children.TryGetValue(text[start], out first))
             {
                 return -1;
             }
             int end = FindLongestEnd(first, text, start + 1);
             if (end < 0 && first.IsWordEnd)
             {
                 // Single-character dictionary word.
                 end = start + 1;
             }
             return end;
         }

         /// <summary>
         /// Replaces every character of every sensitive word found in the input (including
         /// noise characters embedded in the word) with <paramref name="replaceChar"/> and
         /// records the hits in <see cref="IllegalWords"/>.
         /// When several dictionary words start at the same position the longest one wins.
         /// </summary>
         /// <param name="sSourceInput">Text to scan. Null or empty is returned as is.</param>
         /// <param name="replaceChar">Mask character, for example '*'.</param>
         /// <returns>
         /// The masked text, which always has the same length as the input. If no dictionary
         /// has been loaded the input is returned unchanged and the condition is logged.
         /// </returns>
         public string getDataByFilter(string sSourceInput, char replaceChar)
         {
             // Reset first so an early return never leaves the previous call's hits visible.
             illegalWords.Clear();
             if (string.IsNullOrEmpty(sSourceInput))
             {
                 return sSourceInput;
             }

             // Take one snapshot so a concurrent reload cannot change the trie mid-scan.
             TrieNode root = MEMORYLEXICON;
             if (root == null || root.Children.Count == 0)
             {
                 SaveLog($"SensitiveWordFilter.getDataByFilter.内存字典为空");
                 return sSourceInput;
             }

             // Normalise once; ToDBC preserves length, so indexes are shared with the input.
             string normalized = ToDBC(sSourceInput);
             char[] masked = sSourceInput.ToCharArray();

             int pos = 0;
             while (pos < normalized.Length)
             {
                 int end = MatchEnd(root, normalized, pos);
                 if (end < 0)
                 {
                     pos++;
                     continue;
                 }

                 illegalWords.Add(sSourceInput.Substring(pos, end - pos));
                 for (int k = pos; k < end; k++)
                 {
                     masked[k] = replaceChar;
                 }
                 // Resume right after the masked span.
                 pos = end;
             }
             return new string(masked);
         }
     }
    /// <summary>
    /// An insertion-ordered, duplicate-free list of words. It is intended to hold words
    /// that share the same first character; the class itself does not enforce that.
    /// </summary>
    public class WordGroup
    {
        /// <summary>
        /// Words in insertion order; backs <see cref="GetWord"/> and <see cref="Count"/>.
        /// </summary>
        private readonly List<string> groupList = new List<string>();

        /// <summary>
        /// Mirror of <see cref="groupList"/> used for constant-time duplicate detection,
        /// so that adding many words does not degrade to a linear scan per insertion.
        /// </summary>
        private readonly HashSet<string> knownWords = new HashSet<string>(StringComparer.Ordinal);

        public WordGroup()
        {
        }

        /// <summary>
        /// Adds a word unless an identical (ordinal comparison) word is already present.
        /// </summary>
        /// <param name="word">The word to add; the empty string is allowed.</param>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
        public void Add(string word)
        {
            if (word == null)
            {
                throw new ArgumentNullException(nameof(word));
            }
            // HashSet.Add returns false for a duplicate, which keeps the list unique.
            if (knownWords.Add(word))
            {
                groupList.Add(word);
            }
        }

        /// <summary>
        /// Returns the number of distinct words stored.
        /// </summary>
        /// <returns>The word count.</returns>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>
        /// Returns the word at the given zero-based position in insertion order.
        /// </summary>
        /// <param name="index">Zero-based index, must be less than <see cref="Count"/>.</param>
        /// <returns>The stored word.</returns>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }
    373 }

    上面是一个完整的,独立的实现类。 下面给一个简单的调用示例:

     // Global configuration: load the dictionaries once for the whole process.
     // The directory separators below were reconstructed (the published listing had lost
     // its backslashes); adjust the paths to wherever the word lists were unpacked.
     SensitiveWordFilter.Init(new string[] {
         @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\暴恐词库.txt",
         @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\反动词库.txt",
         @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\民生词库.txt",
         @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\色情词库.txt",
         @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\贪腐词库.txt",
         @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\其他词库.txt"
     }); // NOTE: every path must point to an existing file, otherwise no dictionary is loaded and nothing is filtered.

     // Filter instances can be created wherever they are needed; each one keeps its own IllegalWords list.
     SensitiveWordFilter wordFilter = new SensitiveWordFilter();
     Dictionary<string, string> dictTestData = new Dictionary<string, string>();

     // A few samples to see the effect.
     // The first sample originally contained a real dictionary word; it was changed to end in "q"
     // because the blog platform's own filter rejected the post. Substitute words from your
     // dictionary when testing locally.
     dictTestData["杀^人游戏,有人找一夜q"] = "";
     dictTestData["数学学习课堂"] = "";
     dictTestData["打击法0功有,法0功毒害大众"] = "";

     Dictionary<string, string> dictResult = new Dictionary<string, string>();
     foreach (string sKey in dictTestData.Keys)
     {
         // getDataByFilter runs first (left-to-right evaluation), so IllegalWords reflects this key.
         dictResult[sKey] = $"替换后:{wordFilter.getDataByFilter(sKey,'|')},  ------------检测违禁词:{string.Join(",",(wordFilter.IllegalWords==null?new List<string>():wordFilter.IllegalWords))}";
     }

     // NOTE(review): JsonConverter and Utils are helpers that are not shown in this listing.
     string sResultJson = JsonConverter.SerializeObject(dictResult);
     Utils.SaveLog(sResultJson);

     最后,给一下打印的结果:

    "杀^人游戏,有人找一夜q": "替换后:杀^人游戏,有人找|||,  ------------检测违禁词:一夜q",
    "数学学习课堂": "替换后:数学学习课堂,  ------------检测违禁词:",
    "打击法0功有,法0功毒害大众": "替换后:打击|||有,|||毒害大众,  ------------检测违禁词:法0功,法0功"

    -------------附

    词库下载地址:https://codeload.github.com/chason777777/mgck/zip/master

  • 相关阅读:
    自动化单元测试
    Exadata是什么?
    Exadata的独门武器卸载(Offloading)
    Exadata中最有用的功能存储索引
    面向对象分析与设计(第3版)
    代码质量(权威精选植根于开发实践的最佳读物)
    温昱谈程序员向架构师转型的规律
    sql语句大全
    一个弹出层的代码
    ASP.NET 2.0 实现伪静态网页方法 (转载 ————续)
  • 原文地址:https://www.cnblogs.com/taohuadaozhu/p/12707700.html
Copyright © 2011-2022 走看看