zoukankan      html  css  js  c++  java
  • 写一点应用关于 Lucene.Net,snowball的重新组装(三)

    作者:finallyliuyu

    具体实现如下:

    1. 首先在 SnowballAnalyzer.cs 里面建立类 myEwordEntity。这个类可以看做是 Snowball.cs 的接口:主程序调用 Snowball.cs 的最终目的,就是获得关于词的这样一个“实体”。

    /// <summary>
    /// Plain data holder describing one word: its text, its stem, its
    /// part-of-speech tag and where it starts and ends in the article.
    /// </summary>
    /// <remarks>
    /// A freshly created instance has all string fields set to
    /// <see cref="string.Empty"/> and both positions set to zero.
    /// </remarks>
    public class myEwordEntity
    {
        /// <summary>Text of the word.</summary>
        public string txtWord = string.Empty;

        /// <summary>Stem (root) of the word after filtering.</summary>
        public string stemroot = string.Empty;

        /// <summary>Part-of-speech tag of the word.</summary>
        public string posWord = string.Empty;

        /// <summary>Start position of the word in the article.</summary>
        public int token_begin = 0;

        /// <summary>End position of the word in the article.</summary>
        public int token_end = 0;
    }

    2. 在 SnowballAnalyzer.cs 中建立类 Stemmer,完成词根还原功能(代码见(二)中的链接)。

    3。在SnowballAnalyzer.cs中的 class SnowballAnalyzer : Analyzer 做如下修改

           1.private System.String name; // analyzer name, assigned in the constructor
            private System.Collections.Hashtable stopSet;// stop-word table
            private string mModelPath; // location of the part-of-speech tagger's model files

            /// <summary>Builds the named analyzer with no stop words. </summary>
            2。 public SnowballAnalyzer(System.String name)
            {
                //获得词性标注软件模型所在位置。模型文件一般放在本工程下面
                mModelPath = System.IO.Path.GetDirectoryName(
                System.Reflection.Assembly.GetExecutingAssembly().GetName().CodeBase);
                mModelPath = new System.Uri(mModelPath).LocalPath + @"\Models\";
                this.name = name;
            }

            /// <summary>Builds the named analyzer with the given stop words. </summary>
            public SnowballAnalyzer(System.String name, System.String[] stopWords)
                : this(name)
            {
                stopSet = StopFilter.MakeStopSet(stopWords);
            }

    3。重写TokenStream函数

    /// <summary>
    /// Builds the token pipeline for <paramref name="reader"/>:
    /// StandardTokenizer, then StandardFilter, then LowerCaseFilter, and finally a
    /// StopFilter when a stop-word set has been configured.
    /// </summary>
    /// <remarks>
    /// No SnowballFilter is appended here (that step is disabled), so the tokens
    /// produced by this stream are not stemmed.
    /// </remarks>
    /// <param name="fieldName">Not used by this implementation.</param>
    /// <param name="reader">Source text to tokenize.</param>
    /// <returns>The outermost stream of the filter chain.</returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream chain = new LowerCaseFilter(
            new StandardFilter(
                new StandardTokenizer(reader)));

        // Without a stop-word set the lower-cased stream is returned as is.
        if (stopSet == null)
        {
            return chain;
        }

        return new StopFilter(chain, stopSet);
    }

    4。修改后的该类的主要工作函数,从TokenStream中获得词,词的位置,并标注词性

    /// <summary>
    /// Tokenizes <paramref name="reader"/> through <c>TokenStream</c> and turns every
    /// token into a <c>myEwordEntity</c> carrying its text, start/end offsets, stem
    /// and part-of-speech tag.
    /// </summary>
    /// <param name="fieldName">Passed straight through to <c>TokenStream</c>.</param>
    /// <param name="reader">Source text to analyze.</param>
    /// <returns>One entity per token, in the order the stream produced them.</returns>
    public List<myEwordEntity> TokenStreamToEntityList(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream tokens = TokenStream(fieldName, reader);
        List<myEwordEntity> entities = new List<myEwordEntity>();

        // Pass 1: drain the stream, recording offsets, text and stem per token.
        // Stemming is done per word via AfterStemed; an earlier variant that ran a
        // second stream through SnowballFilter is no longer used.
        for (Token current = tokens.Next(); current != null; current = tokens.Next())
        {
            myEwordEntity item = new myEwordEntity();
            item.token_begin = current.StartOffset();
            item.token_end = current.EndOffset();
            item.txtWord = current.TermText();
            item.stemroot = AfterStemed(item.txtWord);
            entities.Add(item);
        }

        // Pass 2: tag all words in one call, then copy the tags back by position.
        ArrayList words = new ArrayList();
        for (int w = 0; w < entities.Count; w++)
        {
            words.Add(entities[w].txtWord);
        }

        EnglishMaximumEntropyPosTagger tagger = new EnglishMaximumEntropyPosTagger(
            mModelPath + "EnglishPOS.nbin",
            mModelPath + @"\Parser\tagdict");
        ArrayList tags = tagger.Tag(words);

        int position = 0;
        foreach (object tag in tags)
        {
            entities[position].posWord = tag.ToString();
            position++;
        }

        return entities;
    }

    5。 词根还原

    /// <summary>
    /// Returns the stem of <paramref name="input"/> as computed by <c>Stemmer</c>.
    /// </summary>
    /// <param name="input">Word to stem; it is lower-cased before stemming.</param>
    /// <returns>The string produced by <c>Stemmer.stemerToString()</c>.</returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="input"/> is <c>null</c>.
    /// </exception>
    public string AfterStemed(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException("input");
        }

        // Lower-case with the invariant culture: the culture-sensitive ToLower()
        // maps "I" to a dotless "ı" under Turkish cultures, which would feed a
        // non-English letter to the stemmer.
        char[] letters = input.ToLowerInvariant().ToCharArray();

        Stemmer stemmer = new Stemmer();
        stemmer.add(letters, letters.Length);
        stemmer.stem();
        return stemmer.stemerToString();
    }

  • 相关阅读:
    删除ubuntu多余内核
    Linux从入门到精通(第8章--磁盘管理)
    图书销售管理系统概要设计,系统数据结构设计分工
    图书管理销售系统,出错设计部分
    图书管理销售系统概要分析,接口设计部分
    图书管理销售系统,运行设计部分
    图书管理销售管理系统,总体设计部分
    图书销售管理概要分析报告,引言部分
    图书销售管理系统概要分析报告总体分工
    图书销售管理系统需求分析,各种功能图部分
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1812908.html
Copyright © 2011-2022 走看看