Due to work requirements I have been studying Lucene.Net recently, and during testing I found that once the index grows to around 5 GB, searching becomes painfully slow. Material I found online says that the analyzer (word segmenter) has a big effect on search speed, but good free Chinese analyzers are hard to come by, so I ended up porting the Java CJKAnalyzer to C#, and I'm sharing the result here. Although I registered this blog a long time ago, I have never written anything; this post is my first, and I hope to exchange ideas with everyone from now on. The two classes below, CJKAnalyzer and CJKTokenizer, are the port.
/**
 * Copyright 2004-2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using System.Collections;
using System.IO;

using Lucene.Net.Analysis;

namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
{
    /**
     * Filters CJKTokenizer with StopFilter.
     *
     * @author Che, Dong
     */
    public class CJKAnalyzer : Analyzer
    {
        //~ Static fields/initializers ---------------------------------------------

        /**
         * An array containing some common English words that are not usually
         * useful for searching and some double-byte interpunctions.
         */
        public static string[] STOP_WORDS = {
            "a", "and", "are", "as", "at", "be",
            "but", "by", "for", "if", "in",
            "into", "is", "it", "no", "not",
            "of", "on", "or", "s", "such", "t",
            "that", "the", "their", "then",
            "there", "these", "they", "this",
            "to", "was", "will", "with", "",
            "www"
        };

        //~ Instance fields --------------------------------------------------------

        /** stop word list */
        private Hashtable stopTable;

        //~ Constructors -----------------------------------------------------------

        /**
         * Builds an analyzer which removes words in {@link #STOP_WORDS}.
         */
        public CJKAnalyzer()
        {
            stopTable = StopFilter.MakeStopSet(STOP_WORDS);
        }

        /**
         * Builds an analyzer which removes words in the provided array.
         *
         * @param stopWords stop word array
         */
        public CJKAnalyzer(string[] stopWords)
        {
            stopTable = StopFilter.MakeStopSet(stopWords);
        }

        //~ Methods ----------------------------------------------------------------

        /**
         * Get a token stream from the input.
         *
         * @param fieldName lucene field name
         * @param reader    input reader
         * @return TokenStream
         */
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream ts = new CJKTokenizer(reader);
            return new StopFilter(ts, stopTable);
            //return new StopFilter(new CJKTokenizer(reader), stopTable);
        }
    }
}
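For the analyzer to actually help with search speed, it has to be used consistently on both the indexing side and the query side, so that query text gets the same segmentation as the indexed text. The sketch below shows that wiring against the Lucene.Net 1.x API from the same era as this port (string-path IndexWriter, Field.Text, the static QueryParser.Parse, Hits); the directory name "index" and the field name "content" are placeholders of mine, and the calls may need adjusting for the Lucene.Net release you actually use.

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using NSharp.SearchEngine.Lucene.Analysis.Cjk;

public class CJKAnalyzerUsage
{
    public static void Main(string[] args)
    {
        Analyzer analyzer = new CJKAnalyzer();

        // Index one document with the CJK analyzer (placeholder path and field name).
        IndexWriter writer = new IndexWriter("index", analyzer, true);
        Document doc = new Document();
        doc.Add(Field.Text("content", "Lucene.Net 中文全文检索测试"));
        writer.AddDocument(doc);
        writer.Optimize();
        writer.Close();

        // Search with the same analyzer so the query is segmented the same way.
        IndexSearcher searcher = new IndexSearcher("index");
        Query query = QueryParser.Parse("全文检索", "content", analyzer);
        Hits hits = searcher.Search(query);
        Console.WriteLine("hits: " + hits.Length());
        searcher.Close();
    }
}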

/**
 * Copyright 2004-2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections;
using System.IO;

using Lucene.Net.Analysis;

/**
 * CJKTokenizer was modified from StopTokenizer, which does a decent job for
 * most European languages. For double-byte characters it tokenizes every two
 * characters with overlap matching.<br>
 * Example: "java C1C2C3C4" is segmented into: "java" "C1C2" "C2C3" "C3C4";
 * the zero-length token "" also needs to be filtered out.<br>
 * Digits, '+' and '#' are tokenized as letters.<br>
 * For more info on Asian language (Chinese, Japanese, Korean) text segmentation,
 * please search <a
 * href="http://www.google.com/search?q=word+chinese+segment">google</a>.
 *
 * @author Che, Dong
 */
namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
{
    public class CJKTokenizer : Tokenizer
    {
        //~ Static fields/initializers ---------------------------------------------

        /** Max word length */
        private static int MAX_WORD_LEN = 255;

        /** I/O buffer size */
        private static int IO_BUFFER_SIZE = 256;

        //~ Instance fields --------------------------------------------------------

        /** word offset, used to imply which character (in the input) is parsed */
        private int offset = 0;

        /** the index used only for ioBuffer */
        private int bufferIndex = 0;

        /** data length */
        private int dataLen = 0;

        /**
         * character buffer, stores the characters which are used to compose
         * the returned Token
         */
        private char[] buffer = new char[MAX_WORD_LEN];

        /**
         * I/O buffer, used to store the content of the input (one of the
         * members of Tokenizer)
         */
        private char[] ioBuffer = new char[IO_BUFFER_SIZE];

        /** word type: single=>ASCII double=>non-ASCII word=>default */
        private string tokenType = "word";

        /**
         * tag: previous character is a cached double-byte character "C1C2C3C4"
         * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
         * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
         */
        private bool preIsTokened = false;

        //~ Constructors -----------------------------------------------------------

        /**
         * Construct a token stream processing the given input.
         *
         * @param reader I/O reader
         */
        public CJKTokenizer(TextReader reader)
        {
            input = reader;
        }

        //~ Methods ----------------------------------------------------------------

        /**
         * Returns the next token in the stream, or null at EOS.
         * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
         * for detail.
         *
         * @return Token
         *
         * @throws IOException when a read error happens in the input stream
         */
        public override Token Next()
        {
            /** how many character(s) have been stored in buffer */
            int length = 0;

            /** the position used to create Token */
            int start = offset;

            while (true)
            {
                /** current character */
                char c;

                offset++;

                /*
                if (bufferIndex >= dataLen)
                {
                    dataLen = input.read(ioBuffer); // In Java, read() does not fail at the end of the stream, but it does in .NET,
                    bufferIndex = 0;
                }
                */

                if (bufferIndex >= dataLen)
                {
                    // In Java, read() does not fail when it reaches the end of the
                    // stream, but in .NET it does, so this check intercepts that case.
                    if (dataLen == 0 || dataLen >= ioBuffer.Length)
                    {
                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                        bufferIndex = 0;
                    }
                    else
                    {
                        dataLen = 0;
                    }
                }

                if (dataLen == 0)
                {
                    if (length > 0)
                    {
                        if (preIsTokened == true)
                        {
                            length = 0;
                            preIsTokened = false;
                        }

                        break;
                    }
                    else
                    {
                        return null;
                    }
                }
                else
                {
                    //get current character
                    c = ioBuffer[bufferIndex++];
                }

                //if the current character is ASCII or Extended ASCII
                if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
                {
                    if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
                    {
                        /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
                        int i = (int) c;
                        i = i - 65248;
                        c = (char) i;
                    }

                    // If the current character is a letter, digit, "_", "+" or "#",
                    // it is appended to buffer as part of a single-byte token;
                    // otherwise the buffered token is emitted.
                    // (The body of this branch is cut off in the original listing.)
                }
                else
                {
                    // non-ASCII letter, eg. "C1C2C3C4": emit overlapping two-character
                    // tokens as described in the class comment.
                    // (The body of this branch is cut off in the original listing.)
                }
            }

            return new Token(new String(buffer, 0, length), start, start + length,
                             tokenType);
        }

        public bool IsAscii(char c)
        {
            return c < 256 && c >= 0;
        }

        public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
        {
            return c <= 0xFFEF && c >= 0xFF00;
        }
    }
}
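To sanity-check the overlap segmentation described in the class comment, a tiny console driver is enough: "java" plus four CJK characters should come out as "java" followed by the overlapping pairs. It only uses members that appear in the listings, plus Token's old-style TermText() accessor from the Lucene.Net 1.x API, which may be named differently in later versions; note it assumes the complete tokenizer, since the listing above is truncated in the original post.

using System;
using System.IO;
using Lucene.Net.Analysis;
using NSharp.SearchEngine.Lucene.Analysis.Cjk;

public class CJKTokenizerDemo
{
    public static void Main(string[] args)
    {
        // "搜索引擎" should be emitted as the overlapping pairs "搜索", "索引", "引擎".
        TextReader reader = new StringReader("java 搜索引擎");
        TokenStream ts = new CJKTokenizer(reader);

        Token token;
        while ((token = ts.Next()) != null)
        {
            Console.WriteLine(token.TermText());
        }
    }
}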
