zoukankan      html  css  js  c++  java
  • 写一点应用关于 Lucene.Net,snowball的重新组装(三)

    作者:finallyliuyu

    具体实现如下:

    1。首先在SnowballAnalyzer.cs里面建立类myEwordEntity,这个类可以看做是snowball.cs的接口:主程序调用Snowball.cs最终目的是为了获得关于词的这样一个“实体”

    /// <summary>
    /// Plain data holder for one word taken from a text: its surface form, its
    /// stem, its part-of-speech tag and its start/end positions in the text.
    /// A new instance has empty strings and zero positions.
    /// </summary>
    public class myEwordEntity
    {
        /// <summary>Text of the word.</summary>
        public string txtWord = string.Empty;

        /// <summary>Stem (root) of the word after filtering.</summary>
        public string stemroot = string.Empty;

        /// <summary>Part-of-speech tag of the word.</summary>
        public string posWord = string.Empty;

        /// <summary>Start position of the word within the text.</summary>
        public int token_begin = 0;

        /// <summary>End position of the word within the text.</summary>
        public int token_end = 0;
    }

    2.在SnowballAnalyzer.cs下面建立类Stemmer,完成词根还原功能,代码见(二)中的链接。

    3。在SnowballAnalyzer.cs中的 class SnowballAnalyzer : Analyzer 做如下修改

           1.private System.String name; // name passed to the constructor
            private System.Collections.Hashtable stopSet;// stop-word set; stays null when no stop words were supplied
            private string mModelPath; // location of the POS-tagger model files

            /// <summary>Builds the named analyzer with no stop words. </summary>
            2。 public SnowballAnalyzer(System.String name)
            {
                //获得词性标注软件模型所在位置。模型文件一般放在本工程下面
                mModelPath = System.IO.Path.GetDirectoryName(
                System.Reflection.Assembly.GetExecutingAssembly().GetName().CodeBase);
                mModelPath = new System.Uri(mModelPath).LocalPath + @"\Models\";
                this.name = name;
            }

            /// <summary>Builds the named analyzer with the given stop words. </summary>
            public SnowballAnalyzer(System.String name, System.String[] stopWords)
                : this(name)
            {
                stopSet = StopFilter.MakeStopSet(stopWords);
            }

    3。重写TokenStream函数

    /// <summary>
    /// Builds the token pipeline for <paramref name="reader"/>: a StandardTokenizer
    /// wrapped in a StandardFilter and a LowerCaseFilter, followed by a StopFilter
    /// when a stop-word set has been configured.
    /// </summary>
    /// <remarks>
    /// No SnowballFilter is appended here, so the returned tokens are not stemmed.
    /// </remarks>
    /// <param name="fieldName">Not used by this implementation.</param>
    /// <param name="reader">Text source handed to the tokenizer.</param>
    /// <returns>The assembled token stream.</returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream pipeline = new LowerCaseFilter(new StandardFilter(new StandardTokenizer(reader)));

        // Without a stop-word set there is nothing left to filter out.
        if (stopSet == null)
        {
            return pipeline;
        }

        return new StopFilter(pipeline, stopSet);
    }

    4。修改后的该类的主要工作函数,从TokenStream中获得词,词的位置,并标注词性

    /// <summary>
    /// Tokenizes <paramref name="reader"/> and returns one <c>myEwordEntity</c> per
    /// token, filled with the token text, its start/end offsets, its stem (via
    /// <c>AfterStemed</c>) and its part-of-speech tag (via the maximum-entropy tagger).
    /// </summary>
    /// <param name="fieldName">Passed through to <c>TokenStream</c>.</param>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <returns>
    /// The entities in token order; an empty list when the stream yields no tokens.
    /// </returns>
    public List<myEwordEntity> TokenStreamToEntityList(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream stream = TokenStream(fieldName, reader);
        List<myEwordEntity> wordEnList = new List<myEwordEntity>();

        // A null token marks the end of the stream; the entity is only allocated
        // once a real token is in hand.
        for (Token token = stream.Next(); token != null; token = stream.Next())
        {
            myEwordEntity entity = new myEwordEntity();
            entity.token_begin = token.StartOffset();
            entity.token_end = token.EndOffset();
            entity.txtWord = token.TermText();
            entity.stemroot = AfterStemed(entity.txtWord);
            wordEnList.Add(entity);
        }

        // Nothing to tag: skip loading the POS model from disk altogether.
        if (wordEnList.Count == 0)
        {
            return wordEnList;
        }

        // The tagger API works on a non-generic list of words.
        ArrayList words = new ArrayList(wordEnList.Count);
        foreach (myEwordEntity entity in wordEnList)
        {
            words.Add(entity.txtWord);
        }

        // Path.Combine copes with mModelPath ending in a separator, so the
        // dictionary path no longer contains a doubled "\\" as it did with
        // plain concatenation of @"\Parser\tagdict".
        string modelFile = System.IO.Path.Combine(mModelPath, "EnglishPOS.nbin");
        string tagDictionary = System.IO.Path.Combine(
            System.IO.Path.Combine(mModelPath, "Parser"), "tagdict");

        EnglishMaximumEntropyPosTagger tagger = new EnglishMaximumEntropyPosTagger(modelFile, tagDictionary);
        ArrayList tags = tagger.Tag(words);

        // Tags are matched to words by position.
        for (int i = 0; i < tags.Count; i++)
        {
            wordEnList[i].posWord = tags[i].ToString();
        }

        return wordEnList;
    }

    5。 词根还原

    /// <summary>
    /// Returns the stem of <paramref name="input"/> as produced by <c>Stemmer</c>.
    /// </summary>
    /// <param name="input">Word to stem; it is lower-cased before stemming.</param>
    /// <returns>The string reported by the stemmer after <c>stem()</c> has run.</returns>
    public string AfterStemed(string input)
    {
        Stemmer stemmer = new Stemmer();

        // Lower-case with invariant rules: the culture-sensitive ToLower() maps
        // 'I' to a dotless 'ı' under e.g. a Turkish locale, which would corrupt
        // English words before they reach the stemmer.
        char[] letters = input.ToLowerInvariant().ToCharArray();

        stemmer.add(letters, letters.Length);
        stemmer.stem();
        return stemmer.stemerToString();
    }

  • 相关阅读:
    UVALive 5983 MAGRID DP
    2015暑假训练(UVALive 5983
    poj 1426 Find The Multiple (BFS)
    poj 3126 Prime Path (BFS)
    poj 2251 Dungeon Master 3维bfs(水水)
    poj 3278 catch that cow BFS(基础水)
    poj3083 Children of the Candy Corn BFS&&DFS
    BZOJ1878: [SDOI2009]HH的项链 (离线查询+树状数组)
    洛谷P3178 [HAOI2015]树上操作(dfs序+线段树)
    洛谷P3065 [USACO12DEC]第一!First!(Trie树+拓扑排序)
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1812908.html
Copyright © 2011-2022 走看看