zoukankan      html  css  js  c++  java
  • 二元分词(Lucene CJK Analyzer).Net版

    由于工作的需要,最近一直在研究Lucene.Net,在测试中我发现当索引库达到5GB左右的时候,搜索速度将变得奇慢。在网上查找一些资料,说分词器会影响搜索速度,但又苦于没有好的免费的分词器,于是只有改写Java版的CJKAnalyzer,我把它共享给大家。虽然我很久就申请了这个Blog,但是一直没有写什么东西,这篇文章也算是我的处女作,希望今后能够和大家多多交流。

     1
     2/**
     3 * Copyright 2004-2005 The Apache Software Foundation
     4 *
     5 * Licensed under the Apache License, Version 2.0 (the "License");
     6 * you may not use this file except in compliance with the License.
     7 * You may obtain a copy of the License at
     8 *
     9 *     http://www.apache.org/licenses/LICENSE-2.0
    10 *
    11 * Unless required by applicable law or agreed to in writing, software
    12 * distributed under the License is distributed on an "AS IS" BASIS,
    13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14 * See the License for the specific language governing permissions and
    15 * limitations under the License.
    16 */

    17using System;
    18using System.Collections;
    19using System.IO;
    20
    21using Lucene.Net.Analysis;
    22
    23namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
    24{
    25    /**
    26     * Filters CJKTokenizer with StopFilter.
    27     *
    28     * @author Che, Dong
    29     */

    30    public class CJKAnalyzer:Analyzer 
    31    {
    32        //~ Static fields/initializers ---------------------------------------------
    33
    34        /**
    35         * An array containing some common English words that are not usually
    36         * useful for searching and some double-byte interpunctions.
    37         */

    38        public  static string[] STOP_WORDS = {
    39                                                 "a""and""are""as""at""be",
    40                                                 "but""by""for""if""in",
    41                                                 "into""is""it""no""not",
    42                                                 "of""on""or""s""such""t",
    43                                                 "that""the""their""then",
    44                                                 "there""these""they""this",
    45                                                 "to""was""will""with""",
    46                                                 "www"
    47                                             }
    ;
    48
    49        //~ Instance fields --------------------------------------------------------
    50
    51        /**
    52         * stop word list
    53         */

    54        private Hashtable stopTable;
    55
    56        //~ Constructors -----------------------------------------------------------
    57
    58        /**
    59         * Builds an analyzer which removes words in {@link #STOP_WORDS}.
    60         */

    61        public CJKAnalyzer() 
    62        {
    63            stopTable = StopFilter.MakeStopSet(STOP_WORDS);
    64        }

    65
    66        /**
    67         * Builds an analyzer which removes words in the provided array.
    68         *
    69         * @param stopWords stop word array
    70         */

    71        public CJKAnalyzer(string[] stopWords) 
    72        {
    73            stopTable = StopFilter.MakeStopSet(stopWords);
    74        }

    75
    76        //~ Methods ----------------------------------------------------------------
    77
    78        /**
    79         * get token stream from input
    80         *
    81         * @param fieldName lucene field name
    82         * @param reader    input reader
    83         * @return TokenStream
    84         */

    85        public override TokenStream TokenStream(string fieldName, TextReader reader) 
    86        {
    87            TokenStream ts=new CJKTokenizer(reader);
    88            return new StopFilter(ts, stopTable);
    89            //return new StopFilter(new CJKTokenizer(reader), stopTable);
    90        }

    91    }

    92}

      1
      2
      3/**
      4 * Copyright 2004-2005 The Apache Software Foundation
      5 *
      6 * Licensed under the Apache License, Version 2.0 (the "License");
      7 * you may not use this file except in compliance with the License.
      8 * You may obtain a copy of the License at
      9 *
     10 *     http://www.apache.org/licenses/LICENSE-2.0
     11 *
     12 * Unless required by applicable law or agreed to in writing, software
     13 * distributed under the License is distributed on an "AS IS" BASIS,
     14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15 * See the License for the specific language governing permissions and
     16 * limitations under the License.
     17 */

     18
     19using System;
     20using System.Collections;
     21using System.IO;
     22
     23using Lucene.Net.Analysis;
     24
     25/**
     26 * CJKTokenizer was modified from StopTokenizer which does a decent job for
     27 * most European languages. It performs other token methods for double-byte
     28 * Characters: the token will return at each two charactors with overlap match.<br>
     29 * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
     30 * also need filter filter zero length token ""<br>
     31 * for Digit: digit, '+', '#' will token as letter<br>
     32 * for more info on Asia language(Chinese Japanese Korean) text segmentation:
     33 * please search  <a
     34 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
     35 *
     36 * @author Che, Dong
     37 */

     38namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
     39{
     40    public  class CJKTokenizer:Tokenizer 
     41    {
        // NOTE(review): this class was recovered from a web scrape. The scrape
        // dropped original source lines 179-235 and 241-290 (the ASCII-token
        // branch and the CJK overlapping-bigram branch of Next()) and stripped
        // the closing ')' from several if/while conditions, so this block does
        // NOT compile as-is. Each damaged spot is flagged below; restore the
        // missing code from the original Lucene (Java) CJKTokenizer.
     42        //~ Static fields/initializers ---------------------------------------------
     43
     44        /** Max word length */
     45        private static int MAX_WORD_LEN = 255;
     46
     47        /** buffer size: */
     48        private static int IO_BUFFER_SIZE = 256;
     49
     50        //~ Instance fields --------------------------------------------------------
     51
     52        /** word offset, used to imply which character(in ) is parsed */
     53        private int offset = 0;
     54
     55        /** the index used only for ioBuffer */
     56        private int bufferIndex = 0;
     57
     58        /** data length */
     59        private int dataLen = 0;
     60
     61        /**
     62         * character buffer, store the characters which are used to compose <br>
     63         * the returned Token
     64         */

     65        private  char[] buffer = new char[MAX_WORD_LEN];
     66
     67        /**
     68         * I/O buffer, used to store the content of the input(one of the <br>
     69         * members of Tokenizer)
     70         */

     71        private  char[] ioBuffer = new char[IO_BUFFER_SIZE];
     72
     73        /** word type: single=>ASCII  double=>non-ASCII word=>default */
     74        private string tokenType = "word";
     75
     76        /**
     77         * tag: previous character is a cached double-byte character  "C1C2C3C4"
     78         * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
     79         * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
     80         */

     81        private bool preIsTokened = false;
     82
     83        //~ Constructors -----------------------------------------------------------
     84
     85        /**
     86         * Construct a token stream processing the given input.
     87         * Assigns the reader to the inherited Tokenizer 'input' field.
     88         *
     89         * @param reader I/O reader
     90         */

     90        public CJKTokenizer(TextReader reader) 
     91        {
     92            input = reader;
     93        }

     94
     95        //~ Methods ----------------------------------------------------------------
     96
     97        /**
     98         * Returns the next token in the stream, or null at EOS.
     99         * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
    100         * for detail.
    101         *
    102         * @return Token
    103         *
    104         * @throws java.io.IOException - throw IOException when read error <br>
    105         *         hanppened in the InputStream
    106         *
    107         */

    108        public override Token Next()
    109        {
    110            /** how many character(s) has been stored in buffer */
    111            int length = 0;
    112
    113            /** the position used to create Token */
    114            int start = offset;
    115
            // NOTE(review): scrape stripped the closing ')' — original is "while (true)".
    116            while (true
    117            {
    118                /** current charactor */
    119                char c;
    120
    121            
    122                offset++;
    123
                // Commented-out original Java-style read, kept for reference:
    124                /*
    125                 if (bufferIndex >= dataLen) 
    126                 {
    127                        dataLen = input.read(ioBuffer); // in Java, read() returns -1 at EOF instead of throwing
    128                        bufferIndex = 0;
    129                 }
    130                 */

    131
                // Refill the I/O buffer when the previous chunk is exhausted.
    132                if (bufferIndex >= dataLen ) 
    133                {
    134                    if (dataLen==0 || dataLen>=ioBuffer.Length)// Java's read() does not fail at end of stream, but .NET's can; this guard intercepts that case
    135                    {
    136                        dataLen = input.Read(ioBuffer,0,ioBuffer.Length);
    137                        bufferIndex = 0;
    138                    }

    139                    else
    140                    {
                        // Last read returned a short (partial) chunk => end of stream.
    141                        dataLen=0;
    142                    }

    143                }

    144
                // NOTE(review): the next three conditions also lost their closing ')'
                // in the scrape: "(dataLen == 0)", "(length > 0)", "(preIsTokened == true)".
    145                if (dataLen ==0
    146                {
    147                    if (length > 0
    148                    {
    149                        if (preIsTokened == true
    150                        {
                            // Last bigram was already emitted; discard the cached tail.
    151                            length = 0;
    152                            preIsTokened = false;
    153                        }

    154
    155                        break;
    156                    }
     
    157                    else 
    158                    {
                        // End of stream and nothing buffered: no more tokens.
    159                        return null;
    160                    }

    161                }
     
    162                else 
    163                {
    164                    //get current character
    165                    c = ioBuffer[bufferIndex++];
    166                }

    167
    168                //if the current character is ASCII or Extend ASCII
    169                if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
    170                {
    171                    if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c)) 
    172                    {
    173                        /** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
                        // Fullwidth forms (U+FF01-U+FF5E) map to ASCII by subtracting 0xFEE0 (65248).
    174                        int i = (int) c;
    175                        i = i - 65248;
    176                        c = (char) i;
    177                    }

                    // NOTE(review): the line below is garbled scrape residue of a
                    // comment, and the jump from line 178 to 236 shows the entire
                    // ASCII/letter handling branch (original lines 179-235) is MISSING.
    178                    if the current character is a letter or "_" "+" "#
    236
    237                }
     
    238                else 
    239                {
    240                    // non-ASCII letter, eg."C1C2C3C4"
                    // NOTE(review): the CJK overlapping-bigram branch (original
                    // lines 241-290) is MISSING from this scrape.
    291                }

    292            }

    293
            // Emit the buffered characters as a Token with [start, start+length) offsets.
    294            return new Token(new String(buffer, 0, length), start, start + length,
    295                tokenType
    296                );
    297        }

    298
        // NOTE(review): despite the name, this accepts the full Latin-1 range
        // (0-255), not just ASCII (0-127); 'c >= 0' is always true for char,
        // which is unsigned in C#.
    299        public bool     IsAscii(char c)
    300        {
    301            return c<256 && c>=0;
    302        }

    303        
        // True when c is in the Unicode "Halfwidth and Fullwidth Forms" block,
        // U+FF00-U+FFEF.
    304        public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
    305        {
    306            return c<=0xFFEF && c>=0xFF00;
    307        }

    308    }

    309}
  • 相关阅读:
    Atitit 集团与个人的完整入口列表 attilax的完整入口 1. 集团与个人的完整入口列表 1 2. 流量入口概念 2 3. 流量入口的历史与发展 2 1.集团与个人的完整入口列表
    atitit 每季度日程表 每季度流程 v3 qaf.docx Ver history V2 add diary cyar data 3 cate V3 fix detail 3cate ,
    Atitit react 详细使用总结 绑定列表显示 attilax总结 1. 前言 1 1.1. 资料数量在百度内的数量对比 1 1.2. 版本16 v15.6.1 1 1.3. 引入js 2
    Atitit r2017 r3 doc list on home ntpc.docx
    Atitit r2017 ra doc list on home ntpc.docx
    Atiitt attilax掌握的前后技术放在简历里面.docx
    Atitit q2016 qa doc list on home ntpc.docx
    Atitit r7 doc list on home ntpc.docx 驱动器 D 中的卷是 p2soft 卷的序列号是 9AD0D3C8 D:\ati\r2017 v3 r01\
    Atitit 可移植性之道attilax著
    Atitit q2016 q5 doc list on home ntpc.docx
  • 原文地址:https://www.cnblogs.com/nsharp/p/571371.html
Copyright © 2011-2022 走看看