由于工作的需要,最近一直在研究Lucene.Net。在测试中我发现,当索引库达到5GB左右的时候,搜索速度会变得奇慢。在网上查找了一些资料,据说分词器会影响搜索速度,但又苦于没有好的免费分词器,于是只好改写Java版的CJKAnalyzer,现在把它共享给大家。虽然我很久以前就申请了这个Blog,但是一直没有写什么东西,这篇文章也算是我的处女作,希望今后能够和大家多多交流。
1
2
/**
3
* Copyright 2004-2005 The Apache Software Foundation
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*/
17
using System;
18
using System.Collections;
19
using System.IO;
20
21
using Lucene.Net.Analysis;
22
23
namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
24
{
25
/// <summary>
/// Analyzer that tokenizes its input with <see cref="CJKTokenizer"/> and then
/// passes the tokens through a <see cref="StopFilter"/>.
/// </summary>
/// <remarks>Original Java version by Che, Dong.</remarks>
public class CJKAnalyzer : Analyzer
{
    /// <summary>
    /// Default stop words: common English words that are not usually useful
    /// for searching. The list also contains the empty string.
    /// Kept as a public mutable field so existing callers keep working.
    /// </summary>
    public static string[] STOP_WORDS = {
        "a", "and", "are", "as", "at", "be",
        "but", "by", "for", "if", "in",
        "into", "is", "it", "no", "not",
        "of", "on", "or", "s", "such", "t",
        "that", "the", "their", "then",
        "there", "these", "they", "this",
        "to", "was", "will", "with", "",
        "www"
    };

    /// <summary>Stop-word set given to every StopFilter this analyzer creates.</summary>
    private readonly Hashtable stopTable;

    /// <summary>
    /// Builds an analyzer which removes the words in <see cref="STOP_WORDS"/>.
    /// </summary>
    public CJKAnalyzer() : this(STOP_WORDS)
    {
    }

    /// <summary>
    /// Builds an analyzer which removes the words in the provided array.
    /// </summary>
    /// <param name="stopWords">Stop word array; must not be null.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="stopWords"/> is null.
    /// </exception>
    public CJKAnalyzer(string[] stopWords)
    {
        // Fail at construction time with a descriptive exception rather than
        // letting a null array travel into MakeStopSet.
        if (stopWords == null)
        {
            throw new ArgumentNullException("stopWords");
        }

        stopTable = StopFilter.MakeStopSet(stopWords);
    }

    /// <summary>
    /// Creates the token stream for the given input: a CJKTokenizer wrapped
    /// in a StopFilter that uses this analyzer's stop-word set.
    /// </summary>
    /// <param name="fieldName">Lucene field name (not used by this analyzer).</param>
    /// <param name="reader">Input reader handed to the tokenizer.</param>
    /// <returns>The stop-filtered token stream.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new StopFilter(new CJKTokenizer(reader), stopTable);
    }
}
92
}

2
/**3
* Copyright 2004-2005 The Apache Software Foundation4
*5
* Licensed under the Apache License, Version 2.0 (the "License");6
* you may not use this file except in compliance with the License.7
* You may obtain a copy of the License at8
*9
* http://www.apache.org/licenses/LICENSE-2.010
*11
* Unless required by applicable law or agreed to in writing, software12
* distributed under the License is distributed on an "AS IS" BASIS,13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.14
* See the License for the specific language governing permissions and15
* limitations under the License.16
*/17
using System;18
using System.Collections;19
using System.IO;20

21
using Lucene.Net.Analysis;22

23
namespace NSharp.SearchEngine.Lucene.Analysis.Cjk24
{25
/// <summary>
/// Analyzer that tokenizes its input with <see cref="CJKTokenizer"/> and then
/// passes the tokens through a <see cref="StopFilter"/>.
/// </summary>
/// <remarks>Original Java version by Che, Dong.</remarks>
public class CJKAnalyzer : Analyzer
{
    /// <summary>
    /// Default stop words: common English words that are not usually useful
    /// for searching. The list also contains the empty string.
    /// Kept as a public mutable field so existing callers keep working.
    /// </summary>
    public static string[] STOP_WORDS = {
        "a", "and", "are", "as", "at", "be",
        "but", "by", "for", "if", "in",
        "into", "is", "it", "no", "not",
        "of", "on", "or", "s", "such", "t",
        "that", "the", "their", "then",
        "there", "these", "they", "this",
        "to", "was", "will", "with", "",
        "www"
    };

    /// <summary>Stop-word set given to every StopFilter this analyzer creates.</summary>
    private readonly Hashtable stopTable;

    /// <summary>
    /// Builds an analyzer which removes the words in <see cref="STOP_WORDS"/>.
    /// </summary>
    public CJKAnalyzer() : this(STOP_WORDS)
    {
    }

    /// <summary>
    /// Builds an analyzer which removes the words in the provided array.
    /// </summary>
    /// <param name="stopWords">Stop word array; must not be null.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="stopWords"/> is null.
    /// </exception>
    public CJKAnalyzer(string[] stopWords)
    {
        // Fail at construction time with a descriptive exception rather than
        // letting a null array travel into MakeStopSet.
        if (stopWords == null)
        {
            throw new ArgumentNullException("stopWords");
        }

        stopTable = StopFilter.MakeStopSet(stopWords);
    }

    /// <summary>
    /// Creates the token stream for the given input: a CJKTokenizer wrapped
    /// in a StopFilter that uses this analyzer's stop-word set.
    /// </summary>
    /// <param name="fieldName">Lucene field name (not used by this analyzer).</param>
    /// <param name="reader">Input reader handed to the tokenizer.</param>
    /// <returns>The stop-filtered token stream.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new StopFilter(new CJKTokenizer(reader), stopTable);
    }
}
} 1
2
3
/**
4
* Copyright 2004-2005 The Apache Software Foundation
5
*
6
* Licensed under the Apache License, Version 2.0 (the "License");
7
* you may not use this file except in compliance with the License.
8
* You may obtain a copy of the License at
9
*
10
* http://www.apache.org/licenses/LICENSE-2.0
11
*
12
* Unless required by applicable law or agreed to in writing, software
13
* distributed under the License is distributed on an "AS IS" BASIS,
14
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
* See the License for the specific language governing permissions and
16
* limitations under the License.
17
*/
18
19
using System;
20
using System.Collections;
21
using System.IO;
22
23
using Lucene.Net.Analysis;
24
25
/**
26
* CJKTokenizer was modified from StopTokenizer which does a decent job for
27
* most European languages. It performs other token methods for double-byte
28
* Characters: the token will return at each two charactors with overlap match.<br>
29
* Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
30
* also need filter filter zero length token ""<br>
31
* for Digit: digit, '+', '#' will token as letter<br>
32
* for more info on Asia language(Chinese Japanese Korean) text segmentation:
33
* please search <a
34
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
35
*
36
* @author Che, Dong
37
*/
38
namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
39
{
40
public class CJKTokenizer:Tokenizer
41
{
42
//~ Static fields/initializers ---------------------------------------------
43
44
/** Max word length */
45
private static int MAX_WORD_LEN = 255;
46
47
/** buffer size: */
48
private static int IO_BUFFER_SIZE = 256;
49
50
//~ Instance fields --------------------------------------------------------
51
52
/** word offset, used to imply which character(in ) is parsed */
53
private int offset = 0;
54
55
/** the index used only for ioBuffer */
56
private int bufferIndex = 0;
57
58
/** data length */
59
private int dataLen = 0;
60
61
/**
62
* character buffer, store the characters which are used to compose <br>
63
* the returned Token
64
*/
65
private char[] buffer = new char[MAX_WORD_LEN];
66
67
/**
68
* I/O buffer, used to store the content of the input(one of the <br>
69
* members of Tokenizer)
70
*/
71
private char[] ioBuffer = new char[IO_BUFFER_SIZE];
72
73
/** word type: single=>ASCII double=>non-ASCII word=>default */
74
private string tokenType = "word";
75
76
/**
77
* tag: previous character is a cached double-byte character "C1C2C3C4"
78
* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
79
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
80
*/
81
private bool preIsTokened = false;
82
83
//~ Constructors -----------------------------------------------------------
84
85
/**
86
* Construct a token stream processing the given input.
87
*
88
* @param in I/O reader
89
*/
90
/// <summary>
/// Creates a tokenizer that reads its characters from <paramref name="reader"/>.
/// </summary>
/// <param name="reader">Text source to tokenize.</param>
public CJKTokenizer(TextReader reader)
{
    // "input" is the reader member provided by the Tokenizer base class.
    this.input = reader;
}
94
95
//~ Methods ----------------------------------------------------------------
96
97
/**
98
* Returns the next token in the stream, or null at EOS.
99
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
100
* for detail.
101
*
102
* @return Token
103
*
104
* @throws java.io.IOException - throw IOException when read error <br>
105
* hanppened in the InputStream
106
*
107
*/
108
public override Token Next()
109
{
110
/** how many character(s) has been stored in buffer */
111
int length = 0;
112
113
/** the position used to create Token */
114
int start = offset;
115
116
while (true)
117
{
118
/** current charactor */
119
char c;
120
121
122
offset++;
123
124
/*
125
if (bufferIndex >= dataLen)
126
{
127
dataLen = input.read(ioBuffer); //Java中read读到最后不会出错,但.Net会,
128
bufferIndex = 0;
129
}
130
*/
131
132
if (bufferIndex >= dataLen )
133
{
134
if (dataLen==0 || dataLen>=ioBuffer.Length)//Java中read读到最后不会出错,但.Net会,所以此处是为了拦截异常
135
{
136
dataLen = input.Read(ioBuffer,0,ioBuffer.Length);
137
bufferIndex = 0;
138
}
139
else
140
{
141
dataLen=0;
142
}
143
}
144
145
if (dataLen ==0)
146
{
147
if (length > 0)
148
{
149
if (preIsTokened == true)
150
{
151
length = 0;
152
preIsTokened = false;
153
}
154
155
break;
156
}
157
else
158
{
159
return null;
160
}
161
}
162
else
163
{
164
//get current character
165
c = ioBuffer[bufferIndex++];
166
}
167
168
//if the current character is ASCII or Extend ASCII
169
if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
170
{
171
if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
172
{
173
/** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
174
int i = (int) c;
175
i = i - 65248;
176
c = (char) i;
177
}
178
if the current character is a letter or "_" "+" "#
236
237
}
238
else
239
{
240
// non-ASCII letter, eg."C1C2C3C4"
291
}
292
}
293
294
return new Token(new String(buffer, 0, length), start, start + length,
295
tokenType
296
);
297
}
298
299
/// <summary>
/// Reports whether <paramref name="c"/> has a code below 256. Despite the
/// name, this accepts codes 128-255 as well as 7-bit ASCII.
/// </summary>
/// <param name="c">Character to classify.</param>
/// <returns>true when the character code is in the range 0-255.</returns>
public bool IsAscii(char c)
{
    // char is unsigned, so a lower-bound check would always succeed.
    const int singleByteLimit = 0x100;
    return c < singleByteLimit;
}
303
304
/// <summary>
/// Reports whether <paramref name="c"/> lies in the range U+FF00 to U+FFEF
/// (both ends inclusive).
/// </summary>
/// <param name="c">Character to classify.</param>
/// <returns>true when the character is inside that range.</returns>
public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
{
    const char blockStart = '\uFF00';
    const char blockEnd = '\uFFEF';

    if (c < blockStart)
    {
        return false;
    }

    return c <= blockEnd;
}
308
}
309
}

2

3
/**4
* Copyright 2004-2005 The Apache Software Foundation5
*6
* Licensed under the Apache License, Version 2.0 (the "License");7
* you may not use this file except in compliance with the License.8
* You may obtain a copy of the License at9
*10
* http://www.apache.org/licenses/LICENSE-2.011
*12
* Unless required by applicable law or agreed to in writing, software13
* distributed under the License is distributed on an "AS IS" BASIS,14
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.15
* See the License for the specific language governing permissions and16
* limitations under the License.17
*/18

19
using System;20
using System.Collections;21
using System.IO;22

23
using Lucene.Net.Analysis;24

25
/**26
* CJKTokenizer was modified from StopTokenizer which does a decent job for27
* most European languages. It performs other token methods for double-byte28
* Characters: the token will return at each two charactors with overlap match.<br>29
* Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it30
* also need filter filter zero length token ""<br>31
* for Digit: digit, '+', '#' will token as letter<br>32
* for more info on Asia language(Chinese Japanese Korean) text segmentation:33
* please search <a34
* href="http://www.google.com/search?q=word+chinese+segment">google</a>35
*36
* @author Che, Dong37
*/38
namespace NSharp.SearchEngine.Lucene.Analysis.Cjk39
{40
public class CJKTokenizer:Tokenizer 41
{42
//~ Static fields/initializers ---------------------------------------------43

44
/** Max word length */45
private static int MAX_WORD_LEN = 255;46

47
/** buffer size: */48
private static int IO_BUFFER_SIZE = 256;49

50
//~ Instance fields --------------------------------------------------------51

52
/** word offset, used to imply which character(in ) is parsed */53
private int offset = 0;54

55
/** the index used only for ioBuffer */56
private int bufferIndex = 0;57

58
/** data length */59
private int dataLen = 0;60

61
/**62
* character buffer, store the characters which are used to compose <br>63
* the returned Token64
*/65
private char[] buffer = new char[MAX_WORD_LEN];66

67
/**68
* I/O buffer, used to store the content of the input(one of the <br>69
* members of Tokenizer)70
*/71
private char[] ioBuffer = new char[IO_BUFFER_SIZE];72

73
/** word type: single=>ASCII double=>non-ASCII word=>default */74
private string tokenType = "word";75

76
/**77
* tag: previous character is a cached double-byte character "C1C2C3C4"78
* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)79
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"80
*/81
private bool preIsTokened = false;82

83
//~ Constructors -----------------------------------------------------------84

85
/**86
* Construct a token stream processing the given input.87
*88
* @param in I/O reader89
*/90
/// <summary>
/// Creates a tokenizer that reads its characters from <paramref name="reader"/>.
/// </summary>
/// <param name="reader">Text source to tokenize.</param>
public CJKTokenizer(TextReader reader)
{
    // "input" is the reader member provided by the Tokenizer base class.
    this.input = reader;
}

95
//~ Methods ----------------------------------------------------------------96

97
/**98
* Returns the next token in the stream, or null at EOS.99
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html100
* for detail.101
*102
* @return Token103
*104
* @throws java.io.IOException - throw IOException when read error <br>105
* hanppened in the InputStream106
*107
*/108
public override Token Next()109
{110
/** how many character(s) has been stored in buffer */111
int length = 0;112

113
/** the position used to create Token */114
int start = offset;115

116
while (true) 117
{118
/** current charactor */119
char c;120

121
122
offset++;123

124
/*125
if (bufferIndex >= dataLen) 126
{127
dataLen = input.read(ioBuffer); //Java中read读到最后不会出错,但.Net会,128
bufferIndex = 0;129
}130
*/131

132
if (bufferIndex >= dataLen ) 133
{134
if (dataLen==0 || dataLen>=ioBuffer.Length)//Java中read读到最后不会出错,但.Net会,所以此处是为了拦截异常135
{136
dataLen = input.Read(ioBuffer,0,ioBuffer.Length);137
bufferIndex = 0;138
}139
else140
{141
dataLen=0;142
}143
}144

145
if (dataLen ==0) 146
{147
if (length > 0) 148
{149
if (preIsTokened == true) 150
{151
length = 0;152
preIsTokened = false;153
}154

155
break;156
} 157
else 158
{159
return null;160
}161
} 162
else 163
{164
//get current character165
c = ioBuffer[bufferIndex++];166
}167

168
//if the current character is ASCII or Extend ASCII169
if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))170
{171
if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c)) 172
{173
/** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */174
int i = (int) c;175
i = i - 65248;176
c = (char) i;177
}178
if the current character is a letter or "_" "+" "#236

237
} 238
else 239
{240
// non-ASCII letter, eg."C1C2C3C4"291
}292
}293

294
return new Token(new String(buffer, 0, length), start, start + length,295
tokenType296
);297
}298

299
/// <summary>
/// Reports whether <paramref name="c"/> has a code below 256. Despite the
/// name, this accepts codes 128-255 as well as 7-bit ASCII.
/// </summary>
/// <param name="c">Character to classify.</param>
/// <returns>true when the character code is in the range 0-255.</returns>
public bool IsAscii(char c)
{
    // char is unsigned, so a lower-bound check would always succeed.
    const int singleByteLimit = 0x100;
    return c < singleByteLimit;
}
304
/// <summary>
/// Reports whether <paramref name="c"/> lies in the range U+FF00 to U+FFEF
/// (both ends inclusive).
/// </summary>
/// <param name="c">Character to classify.</param>
/// <returns>true when the character is inside that range.</returns>
public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
{
    const char blockStart = '\uFF00';
    const char blockEnd = '\uFFEF';

    if (c < blockStart)
    {
        return false;
    }

    return c <= blockEnd;
}
}309
}
