1 #ifndef _HZSEG_H_040415_ 2 #define _HZSEG_H_040415_ 3 4 #include <iostream> 5 #include <string> 6 #include <cstring> 7 #include <cstdlib> 8 #include <fstream> 9 #include "Dict.h" 10 11 using namespace std; 12 13 class CHzSeg 14 { 15 public: 16 CHzSeg(); 17 ~CHzSeg(); 18 19 string SegmentSentenceMM (CDict&, string) const;//只保留下中文 20 string SegmentHzStrMM (CDict&, string) const;//切词 21 string SegmentURL(CDict&, string) const; 22 23 // process a sentence before segmentation 24 void Translate(char* SourceStr) const; 25 }; 26 27 #endif /* _HZSEG_H_040415_ */
1 // HzSeg handling 2 3 #include "HzSeg.h" 4 #include "Dict.h" 5 6 const unsigned int MAX_WORD_LENGTH = 8; 7 const string SEPARATOR("/ "); // delimiter between words 8 9 CHzSeg::CHzSeg() { 10 } 11 12 CHzSeg::~CHzSeg() { 13 } 14 15 // Using Max Matching method to segment a character string. 16 string CHzSeg::SegmentHzStrMM(CDict &dict, string s1) const { 17 string s2 = ""; //保存句子s1的分词结果 18 while (!s1.empty()) { 19 unsigned int len = s1.size(); 20 21 if (len > MAX_WORD_LENGTH) 22 len = MAX_WORD_LENGTH; 23 //如果待切分的句子大于最大切分单元 24 //len=最大切分单元,否则len=句子的长度 25 26 string w = s1.substr(0, len); //取s1句子最左边长度len为的子句子 27 bool isw = dict.IsWord(w); //判断刚刚取出来的子句子是不是一个词 28 29 while (len > 2 && isw == false) { //当w中至少有2个中文字&&不能构成字的时候,减去最右边的一个中文字符 30 len -= 2; // cut a word 31 w = w.substr(0, len); 32 isw = dict.IsWord(w); 33 } 34 s2 += w + SEPARATOR; 35 36 s1 = s1.substr(w.size()); 37 } 38 39 return s2; 40 } 41 42 // process a sentence before segmentation 43 string CHzSeg::SegmentSentenceMM(CDict &dict, string s1) const { 44 string s2 = ""; 45 unsigned int i, len; 46 cout << endl << "I'm in SegmentSentenceMM" << endl; 47 cout << s1 << endl; 48 while (!s1.empty()) { 49 unsigned char ch = (unsigned char) s1[0]; 50 if (ch < 128) { //吃掉一行中所有换行符以外的英文字符 51 i = 1; 52 len = s1.size(); 53 while (i < len && ((unsigned char) s1[i] < 128) && (s1[i] != 10) 54 && (s1[i] != 13)) { // LF, CR 55 i++; //不是回车换行 56 } 57 58 if ((ch != 32) && (ch != 10) && (ch != 13)) { // SP, LF, CR 59 s2 += s1.substr(0, i) + SEPARATOR; 60 } else { 61 if (ch == 10 || ch == 13) { 62 s2 += s1.substr(0, i); 63 cout << "当前s2:" << s2 << endl; 64 } 65 } 66 67 if (i <= s1.size()) { 68 s1 = s1.substr(i); //获得删去部分英文字符后的数据 69 } else 70 break; // 处理完英文字符 71 72 continue; 73 74 } else { 75 if (ch < 176) { //中文标点等非汉字字符128<=ch<176 76 i = 0; 77 len = s1.length(); 78 79 while (i < len && ((unsigned char) s1[i] < 176) 80 && ((unsigned char) s1[i] >= 161) 81 && (!((unsigned char) s1[i] == 161 82 && ((unsigned char) s1[i + 1] >= 162 83 && (unsigned char) s1[i + 1] <= 168))) 84 && (!((unsigned char) s1[i] == 161 85 && ((unsigned char) s1[i + 1] >= 171 86 && (unsigned char) s1[i + 1] <= 191))) 87 && (!((unsigned char) s1[i] == 163 88 && ((unsigned char) s1[i + 1] == 172 89 || (unsigned char) s1[i + 1] == 161) 90 || (unsigned char) s1[i + 1] == 168 91 || (unsigned char) s1[i + 1] == 169 92 || (unsigned char) s1[i + 1] == 186 93 || (unsigned char) s1[i + 1] == 187 94 || (unsigned char) s1[i + 1] == 191))) { 95 i = i + 2; //假定没有半个汉字 96 } 97 98 if (i == 0) 99 i = i + 2; 100 101 if (!(ch == 161 && (unsigned char) s1[1] == 161)) { // 不处理中文空格 102 if (i <= s1.size()) // yhf 103 s2 += s1.substr(0, i) + SEPARATOR; // 其他的非汉字双字节字符可能连续输出 104 else 105 break; // yhf 106 } 107 108 if (i <= s1.size()) { 109 s1 = s1.substr(i); //取s1从下标i开始的子字符串 110 111 } else 112 break; //yhf 113 114 continue; 115 } 116 } 117 118 i = 2; 119 len = s1.length(); 120 while (i < len && (unsigned char) s1[i] >= 176) 121 // while(i<len && (unsigned char)s1[i]>=128 && (unsigned char)s1[i]!=161) 122 i += 2; 123 124 s2 += SegmentHzStrMM(dict, s1.substr(0, i)); 125 126 if (i <= len) // yhf 127 s1 = s1.substr(i); 128 else 129 break; // yhf 130 } 131 132 return s2; 133 } 134 135 // translate the encoded URL(%xx) to actual chars 136 void CHzSeg::Translate(char* SourceStr) const { 137 int i = 0; 138 int j = 0; 139 char *tempstr, tempchar1, tempchar2; 140 141 tempstr = (char*) malloc(strlen(SourceStr) + 1); 142 if (tempstr == NULL) { 143 return; 144 } 145 146 while (SourceStr[j]) { 147 if ((tempstr[i] = SourceStr[j]) == '%') { 148 if (SourceStr[j + 1] >= 'A') 149 tempchar1 = ((SourceStr[j + 1] & 0xdf) - 'A') + 10; 150 else 151 tempchar1 = (SourceStr[j + 1] - '0'); 152 if (SourceStr[j + 2] >= 'A') 153 tempchar2 = ((SourceStr[j + 2] & 0xdf) - 'A') + 10; 154 else 155 tempchar2 = (SourceStr[j + 2] - '0'); 156 tempstr[i] = tempchar1 * 16 + tempchar2; 157 j = j + 2; 158 } 159 i++; 160 j++; 161 } 162 tempstr[i] = '\0'; 163 strcpy(SourceStr, tempstr); 164 165 if (tempstr) 166 free(tempstr); 167 } 168 169 /* 170 * segment the image URL by '/' 171 * omit the domain name 172 */ 173 string CHzSeg::SegmentURL(CDict &dict, string url) const { 174 string::size_type idx, nidx; 175 char *curl = (char *) url.c_str(); 176 this->Translate(curl); 177 url = curl; 178 if ((idx = url.find("http://", 0)) != string::npos) { 179 if ((nidx = url.find("/", 7)) != string::npos) { 180 url = url.substr(nidx + 1); // cut the part of sitename 181 } 182 } 183 idx = 0; 184 while ((idx = url.find("/", idx)) != string::npos) { 185 url.replace(idx, 1, SEPARATOR); // replace "/" with SEPARATOR "/ " 186 idx += 3; 187 } 188 if ((idx = url.rfind(".")) != string::npos) { 189 url = url.erase(idx); // erase the file extension 190 } 191 192 url += "/ "; 193 194 // segment the string whose length is greater than 8 (4 HZ_chars) 195 idx = 0; 196 nidx = 0; 197 bool isover = false; 198 string stmp; 199 while (!isover) { 200 if ((nidx = url.find(SEPARATOR, idx)) == string::npos) 201 isover = true; 202 if (nidx - idx > 0) { 203 stmp = url.substr(idx, nidx - idx); 204 stmp = SegmentSentenceMM(dict, stmp); 205 if (stmp.size() >= 3) 206 stmp.erase(stmp.length() - 3); // erase the tail "/ " 207 url = url.replace(idx, nidx - idx, stmp); 208 idx += stmp.length() + 3; 209 } else if (nidx == string::npos && idx < url.length()) { 210 stmp = url.substr(idx); 211 stmp = SegmentSentenceMM(dict, stmp); 212 stmp.erase(stmp.length() - 3); 213 url = url.substr(0, idx) + stmp; 214 } else 215 idx = nidx + 3; 216 } 217 218 return url; 219 220 }