zoukankan      html  css  js  c++  java
  • 二元分词(Lucene CJK Analyzer).Net版

    由于工作的需要,最近一直在研究Lucene.Net,在测试中我发现当索引库达到5GB左右的时候,搜索速度将变得奇慢。在网上查找了一些资料,说分词器会影响搜索速度,但又苦于没有好的免费的分词器,于是只有改写Java版的CJKAnalyzer,我把它共享给大家。虽然我很久以前就申请了这个Blog,但是一直没有写什么东西,这篇文章也算是我的处女作,希望今后能够和大家多多交流。

     1
     2/**
     3 * Copyright 2004-2005 The Apache Software Foundation
     4 *
     5 * Licensed under the Apache License, Version 2.0 (the "License");
     6 * you may not use this file except in compliance with the License.
     7 * You may obtain a copy of the License at
     8 *
     9 *     http://www.apache.org/licenses/LICENSE-2.0
    10 *
    11 * Unless required by applicable law or agreed to in writing, software
    12 * distributed under the License is distributed on an "AS IS" BASIS,
    13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14 * See the License for the specific language governing permissions and
    15 * limitations under the License.
    16 */

    17using System;
    18using System.Collections;
    19using System.IO;
    20
    21using Lucene.Net.Analysis;
    22
    23namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
    24{
    25    /**
    26     * Filters CJKTokenizer with StopFilter.
    27     *
    28     * @author Che, Dong
    29     */

    /// <summary>
    /// Analyzer for CJK text: tokenizes the input with <see cref="CJKTokenizer"/>
    /// and passes the resulting stream through a StopFilter.
    /// </summary>
    /// <remarks>Ported from the Java CJKAnalyzer (original author: Che, Dong).</remarks>
    public class CJKAnalyzer : Analyzer
    {
        /// <summary>
        /// Default stop words: common English words that are not usually useful
        /// for searching. The list also contains the empty string, so zero-length
        /// tokens are removed by the stop filter as well.
        /// </summary>
        public static string[] STOP_WORDS =
        {
            "a", "and", "are", "as", "at", "be",
            "but", "by", "for", "if", "in",
            "into", "is", "it", "no", "not",
            "of", "on", "or", "s", "such", "t",
            "that", "the", "their", "then",
            "there", "these", "they", "this",
            "to", "was", "will", "with", "",
            "www"
        };

        /// <summary>
        /// Stop-word set produced by StopFilter.MakeStopSet and handed to every
        /// StopFilter this analyzer creates.
        /// </summary>
        private readonly Hashtable stopTable;

        /// <summary>
        /// Builds an analyzer which removes the words in <see cref="STOP_WORDS"/>.
        /// </summary>
        public CJKAnalyzer()
            : this(STOP_WORDS)
        {
        }

        /// <summary>
        /// Builds an analyzer which removes the words in the provided array.
        /// </summary>
        /// <param name="stopWords">Stop word array.</param>
        public CJKAnalyzer(string[] stopWords)
        {
            stopTable = StopFilter.MakeStopSet(stopWords);
        }

        /// <summary>
        /// Creates the token stream for a field: a <see cref="CJKTokenizer"/> over
        /// <paramref name="reader"/>, wrapped in a StopFilter that uses this
        /// analyzer's stop-word set.
        /// </summary>
        /// <param name="fieldName">Lucene field name (not used by this analyzer).</param>
        /// <param name="reader">Input reader supplying the text to analyze.</param>
        /// <returns>The stop-filtered token stream.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            return new StopFilter(new CJKTokenizer(reader), stopTable);
        }
    }

    92}

      1
      2
      3/**
      4 * Copyright 2004-2005 The Apache Software Foundation
      5 *
      6 * Licensed under the Apache License, Version 2.0 (the "License");
      7 * you may not use this file except in compliance with the License.
      8 * You may obtain a copy of the License at
      9 *
     10 *     http://www.apache.org/licenses/LICENSE-2.0
     11 *
     12 * Unless required by applicable law or agreed to in writing, software
     13 * distributed under the License is distributed on an "AS IS" BASIS,
     14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15 * See the License for the specific language governing permissions and
     16 * limitations under the License.
     17 */

     18
     19using System;
     20using System.Collections;
     21using System.IO;
     22
     23using Lucene.Net.Analysis;
     24
     25/**
     26 * CJKTokenizer was modified from StopTokenizer, which does a decent job for
     27 * most European languages. It tokenizes double-byte characters differently:
     28 * a token is emitted for every two adjacent characters, with overlap.<br>
     29 * Example: "java C1C2C3C4" is segmented into: "java" "C1C2" "C2C3" "C3C4"; the
     30 * zero-length token "" also needs to be filtered out.<br>
     31 * Digits: a digit, '+' and '#' are tokenized as letters.<br>
     32 * for more info on Asia language(Chinese Japanese Korean) text segmentation:
     33 * please search  <a
     34 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
     35 *
     36 * @author Che, Dong
     37 */

     38namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
     39{
     40    public  class CJKTokenizer:Tokenizer 
     41    {
     42        //~ Static fields/initializers ---------------------------------------------
     43
     44        /** Max word length */
     45        private static int MAX_WORD_LEN = 255;
     46
     47        /** buffer size: */
     48        private static int IO_BUFFER_SIZE = 256;
     49
     50        //~ Instance fields --------------------------------------------------------
     51
     52        /** word offset, used to imply which character(in ) is parsed */
     53        private int offset = 0;
     54
     55        /** the index used only for ioBuffer */
     56        private int bufferIndex = 0;
     57
     58        /** data length */
     59        private int dataLen = 0;
     60
     61        /**
     62         * character buffer, store the characters which are used to compose <br>
     63         * the returned Token
     64         */

     65        private  char[] buffer = new char[MAX_WORD_LEN];
     66
     67        /**
     68         * I/O buffer, used to store the content of the input(one of the <br>
     69         * members of Tokenizer)
     70         */

     71        private  char[] ioBuffer = new char[IO_BUFFER_SIZE];
     72
     73        /** word type: single=>ASCII  double=>non-ASCII word=>default */
     74        private string tokenType = "word";
     75
     76        /**
     77         * tag: previous character is a cached double-byte character  "C1C2C3C4"
     78         * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
     79         * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
     80         */

     81        private bool preIsTokened = false;
     82
     83        //~ Constructors -----------------------------------------------------------
     84
     85        /**
     86         * Construct a token stream processing the given input.
     87         *
     88         * @param in I/O reader
     89         */

     90        public CJKTokenizer(TextReader reader) 
     91        {
     92            input = reader;
     93        }

     94
     95        //~ Methods ----------------------------------------------------------------
     96
     97        /**
     98         * Returns the next token in the stream, or null at end of stream.
     99         * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
    100         * for details on the Unicode blocks named in the comments below.
    101         *
    102         * @return a Token built from buffer[0..length) with offsets start and
    103         *         start + length and the current tokenType, or null when the
    104         *         input is exhausted and nothing has been buffered
    105         *
    106         * NOTE(review): this listing is incomplete - see the notes in the body.
    107         */

    108        public override Token Next()
    109        {
    110            /** number of characters stored in buffer so far */
    111            int length = 0;
    112
    113            /** the position used to create the Token */
    114            int start = offset;
    115
               // NOTE(review): the closing ")" of the condition is missing on the next
               // line, and on source lines 145, 147 and 149 below.
    116            while (true
    117            {
    118                /** current character */
    119                char c;
    120
    121            
    122                offset++;
    123
    124                /*
    125                 Original refill logic, kept for reference:
    126                 {
    127                        dataLen = input.read(ioBuffer); // In Java, read() does not fail at end of input, but in .NET it does,
    128                        bufferIndex = 0;
    129                 }
    130                 */

    131
                   // Refill ioBuffer once every buffered character has been consumed.
    132                if (bufferIndex >= dataLen ) 
    133                {
                       // NOTE(review): a read that returned fewer than ioBuffer.Length
                       // characters is treated as the last one - the next refill sets
                       // dataLen to 0 instead of reading again. Confirm that the reader
                       // in use never returns a short count before end of input.
    134                    if (dataLen==0 || dataLen>=ioBuffer.Length)// Author's note: Java's read() does not fail at end of input but .NET's does, so this check is here to guard against that error
    135                    {
    136                        dataLen = input.Read(ioBuffer,0,ioBuffer.Length);
    137                        bufferIndex = 0;
    138                    }

    139                    else
    140                    {
    141                        dataLen=0;
    142                    }

    143                }

    144
                   // dataLen == 0 is treated as end of input.
    145                if (dataLen ==0
    146                {
    147                    if (length > 0
    148                    {
                           // The buffered character was already emitted as part of the
                           // previous token, so discard it; the loop then exits and a
                           // zero-length token is returned.
    149                        if (preIsTokened == true
    150                        {
    151                            length = 0;
    152                            preIsTokened = false;
    153                        }

    154
    155                        break;
    156                    }
     
    157                    else 
    158                    {
    159                        return null;
    160                    }

    161                }
     
    162                else 
    163                {
    164                    // get the current character
    165                    c = ioBuffer[bufferIndex++];
    166                }

    167
    168                // if the current character is ASCII or extended ASCII
    169                if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
    170                {
    171                    if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c)) 
    172                    {
    173                        /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN by subtracting 65248 (0xFEE0) */
    174                        int i = (int) c;
    175                        i = i - 65248;
    176                        c = (char) i;
    177                    }

                       // NOTE(review): the next line is the remains of a comment whose
                       // leading "//" was lost; source lines 179-235 (handling of
                       // letters, "_", "+" and "#") are missing from this listing.
    178                    if the current character is a letter or "_" "+" "#
    236
    237                }
     
    238                else 
    239                {
    240                    // non-ASCII letter, e.g. "C1C2C3C4"
                       // NOTE(review): source lines 241-290 are missing from this listing.
    291                }

    292            }

    293
    294            return new Token(new String(buffer, 0, length), start, start + length,
    295                tokenType
    296                );
    297        }

    298
    299        public bool     IsAscii(char c)
    300        {
    301            return c<256 && c>=0;
    302        }

    303        
    304        public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
    305        {
    306            return c<=0xFFEF && c>=0xFF00;
    307        }

    308    }

    309}
  • 相关阅读:
    SQL Server 2008登录错误:无法连接到(local)解决方法
    HTML5 学习
    DNS服务器的配置与管理
    如何把TOMCAT 添加到服务中自动启动
    如何获取WIN10 Program Files 文件夹下的文件操作权限
    Oracle PL/SQL入门语法点
    【Oracle XE系列之三】使用OMF方式手工创建Oracle XE数据库
    【Oracle XE系列之二】PLSQL Developer 远程连接Oracle XE数据库
    【Oracle XE系列之一】Windows10_X64环境 安装Oracle XE11gR2 X64数据库
    Spark 调优
  • 原文地址:https://www.cnblogs.com/nsharp/p/571371.html
Copyright © 2011-2022 走看看