zoukankan      html  css  js  c++  java
  • 基于.Net Framework 3.5的Lucene.Net 中文词组匹配分词器

    可以自己看看是不是很高效。为了加快速度,尽量精简了算法。测试表明,精确度还可以。
    由于没有实现完整的一套字典机制,而是普通的文本字典,所以就不提供完整源码下载了,贴出核心的源码。从版本完整度上来说只能算是0.6版。
    另外,本分词系统使用的词库是ShootAnalyzer的词库。

    使用方法:

    参考以下代码

     1         [TestMethod]
     2         public void TestMethod1()
     3         {
     4             //
     5             // TODO: 在此    添加测试逻辑
     6             //
     7 
     8             Participle p = new Participle();
     9             p.Init(@"D:\labs\xxxx");
    10             string txt = @"天下真的有神吗?我不是呀";
    11             string outstr = string.Empty;
    12             Stopwatch st = new Stopwatch();
    13             st.Start();
    14             outstr = p.TextSpliter(txt);
    15             st.Stop();
    16 
    17             Stopwatch st2 = new Stopwatch();
    18             st2.Start();
    19             List<string> hs = p.TextArray(txt);
    20             st2.Stop();
    21             Console.WriteLine(outstr);
    22             Console.WriteLine(st.ElapsedMilliseconds.ToString("f2"));
    23             Console.WriteLine(st2.ElapsedMilliseconds.ToString("f2"));
    24 
    25             YurowAnalyzer.YurowAnalyzer y = new YurowAnalyzer.YurowAnalyzer(@"D:\labs\xxxx");
    26             TokenStream t = y.TokenStream(nullnew StringReader(txt));
    27 
    28             Token token = t.Next();
    29             while (token != null)
    30             {
    31                 Console.WriteLine(token.TermText() + "\t" + token.StartOffset() + "\t" + token.EndOffset());
    32                 token = t.Next();
    33             }
    34             t.Close();
    35         }

    在Lucene.Net 索引或者搜索中直接使用YurowAnalyzer.YurowAnalyzer 分析器。


    下载地址:
    https://files.cnblogs.com/birdshover/YurowAnalyzer.rar


    下面贴上些关键源码:
    Participle类(分词类)
      1 
      2         public List<int> StartArr;
      3 
      4         public List<string> TextArray(string text)
      5         {
      6             List<string> hs = new List<string>();
      7             StartArr = new List<int>();
      8             int start = 0;
      9             for (int i = 0; i < text.Length; i++)
     10             {
     11                 char nowchar = text[i];
     12                 char nextchar = (i == text.Length - 1? '\0' : text[i + 1];
     13                 if (DataCatch.EnglishChar.Contains(nowchar))
     14                 {
     15                     if (start < 1)
     16                         start = i;
     17                     if (DataCatch.EnglishChar.Contains(nextchar))
     18                         i++;
     19                     else
     20                     {
     21                         hs.Add(text.Substring(start, i - start));
     22                         StartArr.Add(start);
     23                         start = 0;
     24                     }
     25                     continue;
     26                 }
     27 
     28                 if (DataCatch.Num.Contains(nowchar))
     29                 {
     30                     if (start < 1)
     31                         start = i;
     32                     if (DataCatch.Num.Contains(nextchar))
     33                     {
     34                         i++;
     35                     }
     36                     else
     37                     {
     38                         hs.Add(text.Substring(start, i - start));
     39                         StartArr.Add(start);
     40                         start = 0;
     41                     }
     42                     continue;
     43                 }
     44                 if (nowchar == ' ')
     45                 {
     46                     continue;
     47                 }
     48                 if (nextchar == ' ' || nextchar == '\0')
     49                 {
     50                     hs.Add(nowchar.ToString());
     51                     StartArr.Add(i);
     52                     i++;
     53                     continue;
     54                 }
     55                 if (DataCatch.GetDict().ContainsKey(nowchar) && DataCatch.GetDict()[nowchar].ContainsKey(nextchar))
     56                 {
     57                     HashSet<string> list = DataCatch.GetDict()[nowchar][nextchar];
     58                     if (list.Count == 0)
     59                     {
     60                         hs.Add(nowchar.ToString() + nextchar.ToString());
     61                         StartArr.Add(i);
     62                         i++;
     63                         continue;
     64                     }
     65                     int maxnum = 0;
     66                     string temp = string.Empty;
     67                     string outstr = string.Empty;
     68                     foreach (string item in list)
     69                     {
     70                         if (text.Length - i > item.Length + 1)
     71                         {
     72                             temp = text.Substring(i + 2, item.Length);
     73                             if (list.Contains(temp))
     74                             {
     75                                 if (maxnum > item.Length)
     76                                     continue;
     77                                 else
     78                                 {
     79                                     maxnum = item.Length;
     80                                     outstr = temp;
     81                                 }
     82                             }
     83                         }
     84                     }
     85                     if (!string.IsNullOrEmpty(outstr))
     86                     {
     87                         hs.Add(nowchar.ToString() + nextchar.ToString() + outstr);
     88                         StartArr.Add(i);
     89                         i = i + maxnum + 1;
     90                     }
     91                     else
     92                     {
     93                         hs.Add(nowchar.ToString() + nextchar.ToString());
     94                         StartArr.Add(i);
     95                         i++;
     96                     }
     97                 }
     98                 else
     99                 {
    100                     hs.Add(nowchar.ToString());
    101                     StartArr.Add(i);
    102                 }
    103             }
    104             return hs;
    105         }



    DefaultDict类(加载分词具体实现)

    private Dictionary<char, Dictionary<char, HashSet<string>>> dictMemory = new Dictionary<char, Dictionary<char, HashSet<string>>>(DataCatch.InitPage);

     1 protected virtual void DoFormat()
     2         {
     3             Stream stream = new FileStream(dictSourcePath, FileMode.Open, FileAccess.Read, FileShare.Read);
     4             StreamReader sr = new StreamReader(stream, Encoding.Default);
     5             while (sr.Peek() > -1)
     6             {
     7                 string line = sr.ReadLine();
     8                 if (line.Length > 1)
     9                 {
    10                     char charfirst = line[0];
    11                     char charseconde = line[1];
    12                     string other = line.Length > 2 ? line.Remove(02) : null;
    13                     if (dictMemory.ContainsKey(charfirst))
    14                     {
    15                         if (dictMemory[charfirst].ContainsKey(charseconde))
    16                         {
    17                             HashSet<string> list = dictMemory[charfirst][charseconde];
    18                             if (!string.IsNullOrEmpty(other) && !list.Contains(other))
    19                                 list.Add(other);
    20                         }
    21                         else
    22                         {
    23                             HashSet<string> list = new HashSet<string>();
    24                             if (!string.IsNullOrEmpty(other))
    25                                 list.Add(other);
    26                             dictMemory[charfirst].Add(charseconde, list);
    27                         }
    28                     }
    29                     else
    30                     {
    31                         Dictionary<char, HashSet<string>> d = new Dictionary<char, HashSet<string>>();
    32                         HashSet<string> list = new HashSet<string>();
    33                         if (!string.IsNullOrEmpty(other))
    34                             list.Add(other);
    35                         d.Add(charseconde, list);
    36                         dictMemory.Add(charfirst, d);
    37                     }
    38                 }
    39             }
    40         }

    转换到Lucene接口

     1     public class YurowTokenizer : Tokenizer
     2     {
     3         private string text;
     4         private List<string> list;
     5         int current = 0;
     6         private string path;
     7         static Participle p;
     8         bool isfirstrun = true;
     9 
    10         public YurowTokenizer(TextReader textreader, string path)
    11         {
    12             text = textreader.ReadToEnd();
    13             this.path = path;
    14             if (p == null)
    15             {
    16                 p = new Participle();
    17                 p.Init(path);
    18             }
    19         }
    20 
    21         public override Token Next()
    22         {
    23             if (string.IsNullOrEmpty(text))
    24                 return null;
    25 
    26             if (isfirstrun)
    27             {
    28                 list = p.TextArray(text);
    29                 isfirstrun = false;
    30             }
    31             if (list.Count < 1 || current >= list.Count)
    32                 return null;
    33             int start = p.StartArr[current];
    34             string currentstr = list[current];
    35             Token token = new Token(currentstr, start, start + currentstr.Length);
    36             current++;
    37             return token;
    38         }
    39     }


    有兴趣的朋友可以自己反编译查看源码。暂时不提供完整源码。

    http://www.cnblogs.com/birdshover/ by yurow
  • 相关阅读:
    在win7中关闭UAC(用户存取控制)
    如何卸载那些没有出现在“add/remove program”中的程序?
    Win7 打开显示文件后缀
    有关windows 自动登录的registry Key
    原来如此 新学一招
    file's owner以及outlet与连线的理解
    学习笔记:正确使用PresentModalViewController
    学习笔记:UIViewController生命周期
    IPhone之自定义弹出窗口
    学习笔记:iOS 视图控制器(UIViewController)剖析
  • 原文地址:https://www.cnblogs.com/birdshover/p/1120517.html
Copyright © 2011-2022 走看看