zoukankan      html  css  js  c++  java
  • 脏字过滤算法

    原文 http://www.cnblogs.com/xingd/archive/2009/06/23/1061800.html

    程序包下载Word.rar

    修改后

     public class DirtyWordOper
        {
            private static Dictionary<string, object> hash = new Dictionary<string, object>();
            private static BitArray firstCharCheck = new BitArray(char.MaxValue);//把脏词的第一个字符记录下来
            private static BitArray allCharCheck = new BitArray(char.MaxValue);//把每一个个脏词的所有字符都记录下来
            private static int maxLength = 0;//
            private static bool onlyOne = true;

            #region
            /// <summary>
            /// 返回替换后的字符串 字符串的长度不变
            /// </summary>
            /// <param name="text"></param>
            /// <returns></returns>
            public string Replace(string text)
            {
                if (onlyOne)
                {
                    Init();//初始化数据 执行一次就不会执行了
                    onlyOne = false;
                }
                if (!isDirtyword(text))
                {
                    return text;
                }
                //获取替换操作表
                List<DetailRepModel> drlist = GetList(text);
                //执行替换操作
                return Replace2(text, drlist);
            }

            /// <summary>
            /// 初始化用  只执行一次
            /// </summary>
            /// <param name="text"></param>
            private static void Init()
            {
                string[] badwords = DirtyWordData.DirtyKeyword.Split('|');
                foreach (string bw in badwords)
                {
                    string[] strarrtemp = bw.Split('&');
                    string word = strarrtemp[0];
                    word = word.Trim();//去掉数据中的空格及格式 符号
                    word = word.Replace("/r", "");
                    word = word.Replace("/n", "");
                    if (word == "")
                    {
                        break;
                    }
                    if (!hash.ContainsKey(word))
                    {
                        hash.Add(word, null);
                        maxLength = Math.Max(maxLength, word.Length);
                        firstCharCheck[word[0]] = true;

                        foreach (char c in word)
                        {
                            allCharCheck[c] = true;
                        }
                    }
                }
            }
            /// <summary>
            /// 是否包含 了 脏 词
            /// </summary>
            /// <param name="text"></param>
            /// <returns></returns>
            private static bool isDirtyword(string text)
            {
                int index = 0;
                //int offset = 0;
                while (index < text.Length)
                {
                    //如果第一个字符都不符合
                    if (!firstCharCheck[text[index]])
                    {// 直接找到与脏词第一字符相同为止
                        while (index < text.Length - 1 && !firstCharCheck[text[++index]]) ;
                    }
                    for (int j = 1; j <= Math.Min(maxLength, text.Length - index); j++)
                    {
                        if (!allCharCheck[text[index + j - 1]])
                        {
                            break;
                        }
                        string sub = text.Substring(index, j);
                        //判定脏字字典中是否包括了脏词
                        if (hash.ContainsKey(sub))
                        {
                            return true;//是
                        }
                    }
                    index++;
                }
                return false;//否
            }

            /// <summary>
            /// 返回操作列表
            /// </summary>
            /// <param name="text"></param>
            /// <returns></returns>
            private static List<DetailRepModel> GetList(string text)
            {
                List<DetailRepModel> DetailList = new List<DetailRepModel>();
                int index = 0;
                while (index < text.Length)
                {
                    if (!firstCharCheck[text[index]])
                    {
                        while (index < text.Length - 1 && !firstCharCheck[text[++index]]) ;
                    }
                    DetailRepModel tempDetail = null;
                    for (int j = 1; j <= Math.Min(maxLength, text.Length - index); j++)
                    {
                        if (!allCharCheck[text[index + j - 1]])
                        {
                            if (tempDetail != null)
                            {//优先先字符串替换
                                index = index + tempDetail.number - 1;//索引要返回上一位,所以要减1
                                DetailList.Add(tempDetail);
                            }
                            break;
                        }
                        string sub = text.Substring(index, j);
                        if (hash.ContainsKey(sub))
                        {
                            tempDetail = new DetailRepModel();
                            tempDetail.index = index;
                            tempDetail.number = sub.Length;
                            tempDetail.content = sub;
                            //break;//进行下一次 不然要出现, abc 其中ab 与a都关键字要生成两个操作                      
                        }
                        if (tempDetail != null)
                        {
                            if (j + 1 > Math.Min(maxLength, text.Length - index))
                            {//优先先字符串替换
                                DetailList.Add(tempDetail);
                                index = index + tempDetail.number - 1;//索引要返回上一位,所以要减1
                            }
                        }
                    }
                    index++;
                }
                return DetailList;
            }
            /// <summary>
            /// 传入 字串和 脏字替换操作表,
            /// </summary>
            /// <param name="text"></param>
            /// <param name="drlist"></param>
            /// <returns> 输出替换后的字串</returns>
            private static string Replace2(string text, List<DetailRepModel> drlist)
            {


                if (drlist == null || drlist.Count == 0 || text == "")
                {
                    return text;
                }
                foreach (DetailRepModel dr in drlist)
                {
                    if (dr != null)
                    {
                        string strtemp = text.Substring(dr.index, dr.number);
                        object ob = DirtyWordData.DirtyHT[(object)strtemp];
                        if (ob == null)
                        {
                            //记录错误
                            break;
                        }
                        // 这样替换 有错误 ,
                        text = text.Substring(0, dr.index) + ob.ToString() + text.Substring(dr.index + dr.number);
                        //text = text.Replace(strtemp, ob.ToString());
                    }
                }
                return text;
            }
            #endregion
        }

    效果还行,不过我们老大告诉了我一个更厉害的方法,据说比这种方法快 50 倍,只是写起来有点麻烦:

     /// <summary>
     /// Contract for a replacer that maps an input string to an output string.
     /// In this file, ReplaceDW generates and compiles a class named ReplaceDW_
     /// that implements this interface, and forwards its own Replace call to it.
     /// </summary>
     public interface IReplaceDW
        {
            /// <summary>
            /// Returns the processed form of <paramref name="s"/>.
            /// </summary>
            /// <param name="s">Text to process.</param>
            /// <returns>The resulting text.</returns>
            string Replace(string s);
        }
        public class ReplaceDW
        {
            public static void AddToWords(DirtyChar parent, string s, string t)
            {
                DirtyChar dc = parent.Children.Find(o => o.Orienginal == s[0]);
                if (dc == null)
                {
                    dc = new DirtyChar() { Orienginal = s[0], Children = new List<DirtyChar>(), Target = "" };
                    parent.Children.Add(dc);
                }
                if (s.Length > 1)
                {//
                    AddToWords(dc, s.Substring(1), t);
                }
                else
                {
                    dc.Target = t;
                }
            }

            public static string BuildChildren(DirtyChar dc, int deepLevel)
            {
                StringBuilder sb = new StringBuilder();
                string spaces = new string(' ', deepLevel + 4);

                if (dc.Children.Count > 0)
                {
                    sb.Append(@"
    " + spaces + @"if (i + 1 == len){");
                    sb.Append(@"
    " + spaces + @"    sb.Append(""" + dc.Target + @""");
                    ");
                    sb.Append(@"
    " + spaces + @"    i++;
    " + spaces + @"    break;}");
                    sb.Append(@"
    " + spaces + @" switch (s[i + " + deepLevel.ToString() + @"])
    " + spaces + @" {
    ");
                    foreach (DirtyChar c in dc.Children)
                    {
                        sb.Append(@"
    " + spaces + @"  case '" + c.Orienginal + @"':
    ");
                        sb.Append(BuildChildren(c, deepLevel + 1));
                        sb.Append(@"
    " + spaces + @"   break;");
                    }
                  
                  
                    sb.Append(@"
    " + spaces + @" default:
    " + spaces + @"    sb.Append(""" + dc.Target + @""");
    " + spaces + @"    i++;
    " + spaces + @"    break;
    " + spaces + @" }
    ");
                }
                else
                {
                    sb.Append(@"
    " + spaces + @"  sb.Append(""" + dc.Target + @""");
    ");
                    if (deepLevel == 1)
                    {
                        sb.Append(@"
    " + spaces + @"  i++;
    ");
                    }
                    else
                    {
                        sb.Append(@"
    " + spaces + @"  i += " + (deepLevel).ToString() + @";
    ");
                    }
                }
                return sb.ToString();
            }


            private IReplaceDW _r = null;
            private static bool isfirst = true;
            public string Replace(string s)
            {
                return _r.Replace(s);
            }
            private static List<KeyValuePair<string, string>> tmp = new List<KeyValuePair<string, string>>();
            public ReplaceDW()
            {
                if (isfirst)
                {              
                    List<KeyValuePair<string, string>> dict = new List<KeyValuePair<string, string>>();
                    foreach (DictionaryEntry d in KeyWord.DirtyWordData.DirtyHT)
                    {
                        dict.Add(new KeyValuePair<string, string>(d.Key.ToString(), d.Value.ToString()));
                    }
                    // 整理进 list
                    //List<KeyValuePair<string, string>> tmp = new List<KeyValuePair<string, string>>();
                    foreach (KeyValuePair<string, string> kv in dict)
                    {
                        tmp.Add(kv);
                    }
                    // 倒排
                    tmp.Sort((a, b) => { return b.Key.CompareTo(a.Key); });
                    isfirst = false;
                }
                var compiler = new CSharpCodeProvider();
                var options = new CompilerParameters();

                // set compile options  
                options.CompilerOptions = "/o";
                options.GenerateExecutable = false;
                options.GenerateInMemory = true;
                options.ReferencedAssemblies.Add("System.dll");
                options.ReferencedAssemblies.Add(this.GetType().Assembly.Location);

                // set the source code to compile  
                DirtyChar words = new DirtyChar() { Children = new List<DirtyChar>() };
                //DirtyChar words2 = new DirtyChar();
                //words2.Children = new List<DirtyChar>();
                foreach (KeyValuePair<string, string> kv in tmp)
                {//构建字典表
                    AddToWords(words, kv.Key, kv.Value);
                }


                StringBuilder sb = new StringBuilder();
                sb.Append(@"
    using System;  
    namespace KeyWord
    {
    public class ReplaceDW_ : IReplaceDW
    {  
        public string Replace( string s )
     {  
      int len = s.Length, i = 0;
            System.Text.StringBuilder sb = new System.Text.StringBuilder(len);
    ");
                sb.Append(@"
      while (i < len)
      {
       switch (s[i])
       {
    ");
                foreach (DirtyChar c in words.Children)
                {
                    sb.Append(@"
        case '" + c.Orienginal + @"':
    ");
                    sb.Append(BuildChildren(c, 1));
                    sb.Append(@"
         break;");
                }
                sb.Append(@"
        default:
         sb.Append(s[i++]);
         break;
       }
      }
    ");
                sb.Append(@"
      return sb.ToString();

        }  
    }
    }");
                // compile the code, on-the-fly  
                var result = compiler.CompileAssemblyFromSource(options, sb.ToString());
               
                foreach (var error in result.Errors)
                {
                    // print errors  
                    ;
                }

                // if compilation sucessed  
                if ((!result.Errors.HasErrors) && (result.CompiledAssembly != null))
                {
                    var type = result.CompiledAssembly.GetType("KeyWord.ReplaceDW_");
                    try
                    {
                        if (type != null)
                        {
                            this._r = Activator.CreateInstance(type) as IReplaceDW;
                        }
                        this.Replace("x"); //预热
                        this.Replace("x"); //预热
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex);
                    }
                }
            }
        }

  • 相关阅读:
    oracle 的一点累积
    ZT: 网页的一些技巧
    ZT: WEB学习资料
    开源java
    倒序显示文本
    plsql使用之debug
    转 一些shell经验
    lpad rpad
    2018.8.19 2018暑假集训之maxnum
    2018.8.17 题解 2018暑假集训之编辑距离
  • 原文地址:https://www.cnblogs.com/j349900963/p/1887929.html
Copyright © 2011-2022 走看看