zoukankan      html  css  js  c++  java
  • lucene下的一个自定义分词

    /// <summary>
    /// Lucene analyzer that tokenizes text with <c>ICTCLASTokenizer</c> and then
    /// applies the standard, lower-case and stop-word filters.
    /// The stop words are read from <see cref="NoisePath"/> when the analyzer is constructed.
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
        {
            /// <summary>
            /// Shared stop-word table (Chinese and English), filled by the constructor.
            /// Slots beyond the number of words actually read stay <c>null</c>, so do not
            /// pass this array directly to APIs that reject null entries.
            /// </summary>
            public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[428];

            /// <summary>
            /// Path of the stop-word file, one word per line, relative to the process's
            /// current directory at construction time.
            /// </summary>
            public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";

            // Exactly the words read by this instance, with no trailing null slots.
            private readonly string[] loadedStopWords;

            /// <summary>
            /// Reads stop words from <see cref="NoisePath"/> into
            /// <see cref="CHINESE_ENGLISH_STOP_WORDS"/>. Reading stops at end of file, at the
            /// first empty line, or once the table is full, whichever comes first.
            /// </summary>
            public ICTCLASAnalyzer()
            {
                int count = 0;

                // 'using' guarantees the file handle is released even if reading throws.
                using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
                {
                    string noise = reader.ReadLine();
                    while (!string.IsNullOrEmpty(noise) && count < CHINESE_ENGLISH_STOP_WORDS.Length)
                    {
                        CHINESE_ENGLISH_STOP_WORDS[count] = noise;
                        count++;
                        noise = reader.ReadLine();
                    }
                }

                // Keep a right-sized copy so the stop filter never sees the null padding
                // left in the fixed-size shared table.
                loadedStopWords = new string[count];
                Array.Copy(CHINESE_ENGLISH_STOP_WORDS, loadedStopWords, count);
            }

            /// <summary>
            /// Builds the token stream for a field: ICTCLAS tokenizer, then
            /// <c>StandardFilter</c>, <c>LowerCaseFilter</c> and a <c>StopFilter</c>
            /// using the stop words loaded by this instance.
            /// </summary>
            /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
            /// <param name="reader">Source of the text to tokenize.</param>
            /// <returns>The filtered token stream.</returns>
            public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
            {
                TokenStream result = new ICTCLASTokenizer(reader);
                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);

                result = new StopFilter(result, loadedStopWords);
                return result;
            }
        }

  • 相关阅读:
    OpenShift
    ant exec
    深入了解Ant构建工具 命令
    防止sql注入和跨站脚本攻击,跨站请求伪造以及一句话木马的学习记录
    Web攻防之XSS,CSRF,SQL注入(转)
    sublime text常用快捷键(转)
    fiddler使用心得记录
    python+tesseract验证码识别的一点小心得
    window脚本命令学习(转)
    python发送邮件(转)
  • 原文地址:https://www.cnblogs.com/wycg1984/p/1722402.html
Copyright © 2011-2022 走看看