zoukankan      html  css  js  c++  java
  • .NET:脏字处理类,效率很高

    BadWordParse 类:

    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Collections;
    using System.IO;

    namespace charCheck
    {
        
    public class BadWordParse
        {


            
    private HashSet<string> hash = new HashSet<string>();
            
    private byte[] fastCheck = new byte[char.MaxValue];
            
    private BitArray charCheck = new BitArray(char.MaxValue);
            
    private int maxWordLength = 0;
            
    private int minWordLength = int.MaxValue;
            
    private bool _isHave = false;
            
    private string _replaceString = "*";
            
    private char _splitString = '|';
            
    private string _newWord;
            
    private string _badWordFilePath;


            
    /// <summary>
            
    /// 是否含有脏字
            
    /// </summary>
            public bool IsHave
            {
                
    get { return _isHave; }
            }

            
    /// <summary>
            
    /// 替换后字符串
            
    /// </summary>
            public string ReplaceString
            {
                
    set { _replaceString = value; }
            }
            
    /// <summary>
            
    /// 脏字字典切割符
            
    /// </summary>
            public char SplitString
            {
                
    set { _splitString = value; }
            }

            
    /// <summary>
            
    /// 更新后的字符串
            
    /// </summary>
            public string NewWord
            {
                
    get { return _newWord; }
            }

            
    /// <summary>
            
    /// 脏字字典文档路径
            
    /// </summary>
            public string BadWordFilePath
            {
                
    get { return _badWordFilePath; }
                
    set { _badWordFilePath = value; }
            }

            
    public BadWordParse(string filePath)
            {
                _badWordFilePath 
    = filePath;
                
    string srList = string.Empty;
                
    if (File.Exists(_badWordFilePath))
                {
                    StreamReader sr 
    = new StreamReader(_badWordFilePath, Encoding.GetEncoding("gb2312"));
                    srList 
    = sr.ReadToEnd();
                    sr.Close();
                    sr.Dispose();
                }
                
    string[] badwords = srList.Split('|');
                
    foreach (string word in badwords)
                {
                    maxWordLength 
    = Math.Max(maxWordLength, word.Length);
                    minWordLength 
    = Math.Min(minWordLength, word.Length);
                    
    for (int i = 0; i < 7 && i < word.Length; i++)
                    {
                        fastCheck[word[i]] 
    |= (byte)(1 << i);
                    }

                    
    for (int i = 7; i < word.Length; i++)
                    {
                        fastCheck[word[i]] 
    |= 0x80;
                    }

                    
    if (word.Length == 1)
                    {
                        charCheck[word[
    0]] = true;
                    }
                    
    else
                    {
                        hash.Add(word);
                    }
                }
            }
            
    public bool HasBadWord(string text)
            {
                
    int index = 0;

                
    while (index < text.Length)
                {


                    
    if ((fastCheck[text[index]] & 1== 0)
                    {
                        
    while (index < text.Length - 1 && (fastCheck[text[++index]] & 1== 0) ;
                    }

                    
    //单字节检测
                    if (minWordLength == 1 && charCheck[text[index]])
                    {
                        
    return true;
                    }


                    
    //多字节检测
                    for (int j = 1; j <= Math.Min(maxWordLength, text.Length - index - 1); j++)
                    {
                        
    //快速排除
                        if ((fastCheck[text[index + j]] & (1 << Math.Min(j, 7))) == 0)
                        {
                            
    break;
                        }

                        
    if (j + 1 >= minWordLength)
                        {
                            
    string sub = text.Substring(index, j + 1);

                            
    if (hash.Contains(sub))
                            {
                                
    return true;
                            }
                        }
                    }
                    index
    ++;
                }
                
    return false;
            }

            
    public string ReplaceBadWord(string text)
            {
                
    int index = 0;

                
    for (index = 0; index < text.Length; index++)
                {
                    
    if ((fastCheck[text[index]] & 1== 0)
                    {
                        
    while (index < text.Length - 1 && (fastCheck[text[++index]] & 1== 0) ;
                    }

                    
    //单字节检测
                    if (minWordLength == 1 && charCheck[text[index]])
                    {
                        
    //return true;
                        _isHave = true;
                        text 
    = text.Replace(text[index], _replaceString[0]);
                        
    continue;
                    }
                    
    //多字节检测
                    for (int j = 1; j <= Math.Min(maxWordLength, text.Length - index - 1); j++)
                    {

                        
    //快速排除
                        if ((fastCheck[text[index + j]] & (1 << Math.Min(j, 7))) == 0)
                        {
                            
    break;
                        }

                        
    if (j + 1 >= minWordLength)
                        {
                            
    string sub = text.Substring(index, j + 1);

                            
    if (hash.Contains(sub))
                            {

                                
    //替换字符操作
                                _isHave = true;
                                
    char cc = _replaceString[0];
                                
    string rp = _replaceString.PadRight((j + 1), cc);
                                text 
    = text.Replace(sub, rp);
                                
    //记录新位置
                                index += j;
                                
    break;
                            }
                        }
                    }
                }
                _newWord 
    = text;
                
    return text;
            }
        }


    }

    测试代码:

    代码
     string filePath = "F://charCheck/charCheck/badword.txt";  
                
    string testString = "";
                System.IO.StreamReader sr 
    = new System.IO.StreamReader(filePath, System.Text.Encoding.GetEncoding("gb2312"));
                
    //testString = sr.ReadToEnd();
                sr.Close();
                sr.Dispose();
                
    //uint t = GetTickCount();
                BadWordParse bwp = new BadWordParse(filePath);
                
    string parsedString = bwp.ReplaceBadWord(testString);
                
    //uint time = GetTickCount() - t;
                
    //Console.Write("使用时间:" + time.ToString());
                
    //Console.Write("\r\n");
                
    //Console.Write("原始字符串" + parsedString);
                
    //Console.Write("\r\n");
                
    //Console.Write("替换后字符串" + parsedString);
  • 相关阅读:
    Mysql Select 语句中实现的判断
    SQL根据一个字符串集合循环保存数据库
    SQL语句 不足位数补0
    SVN常见错误
    svn架构
    关于EXCEL显示数字
    exception from hresult:0x8000401A(excel文档导出)
    WIN7安装注意事项
    解决卸载时残留目标文件夹的问题
    Installshield执行多条批处理的方法
  • 原文地址:https://www.cnblogs.com/Fooo/p/1630710.html
Copyright © 2011-2022 走看看