#ifndef ALGORITHM_WUMANBER_H
#define ALGORITHM_WUMANBER_H

// <stdint.h> (rather than relying on a transitive include) provides the
// unqualified int32_t used by the class below; <utility> provides std::pair.
#include <stdint.h>

#include <set>
#include <string>
#include <utility>
#include <vector>

/** Set of distinct patterns reported by a search. */
typedef std::set<std::string> ResultSetType;
/** Text positions reported by a search (see the Search overloads taking it). */
typedef std::vector<unsigned int> MatchPosVector;
/** (hash of a pattern's first block, index of that pattern in the pattern list). */
typedef std::pair<unsigned int, int> PrefixIdPairType;
/** One HASH-table bucket: every pattern whose trailing block hashes to the same slot. */
typedef std::vector<PrefixIdPairType> PrefixTableType;

/**
 * @brief Multi-pattern string matcher based on the Wu-Manber algorithm.
 *
 * Usage: call Init() once with the pattern list, then call any Search()
 * overload. Every Search() overload returns the number of hits found in the
 * text; each occurrence is counted, so the value can be larger than the
 * number of distinct patterns that matched.
 */
class WuManber
{
public:
    WuManber();
    ~WuManber();

    /**
     * @brief Build the SHIFT and HASH tables for a pattern list.
     *
     * @param patterns patterns to be matched; every pattern is expected to be
     *                 a non-empty string
     *
     * @return true on success, false if no pattern was supplied
     */
    bool Init(const std::vector<std::string>& patterns);

    /**
     * @brief Search text and collect the patterns that matched.
     *
     * @param text       raw text to scan (callers should pass a NUL-terminated buffer)
     * @param textLength length of text in bytes
     * @param res        receives a copy of every matched pattern (existing
     *                   content is kept)
     *
     * @return 0: no pattern matches, n: n hits found (n > 0)
     */
    int Search(const char* text, const int textLength, ResultSetType& res);

    /**
     * @brief Search a string and collect the patterns that matched.
     *
     * @param str raw text to scan
     * @param res receives a copy of every matched pattern
     *
     * @return 0: no pattern matches, n: n hits found (n > 0)
     */
    int Search(const std::string& str, ResultSetType& res);

    /**
     * @brief Search text and only count the hits.
     *
     * @param text       raw text to scan (callers should pass a NUL-terminated buffer)
     * @param textLength length of text in bytes
     *
     * @return 0: no pattern matches, n: n hits found (n > 0)
     */
    int Search(const char* text, const int textLength);

    /**
     * @brief Search text and record where each hit was found.
     *
     * @param text           raw text to scan (callers should pass a NUL-terminated buffer)
     * @param textLength     length of text in bytes
     * @param matchPosVector receives one entry per hit: the 0-based text index
     *                       of the last character of the scan window, i.e.
     *                       match start + (shortest pattern length - 1)
     *
     * @return 0: no pattern matches, n: n hits found (n > 0)
     */
    int Search(const char* text, const int textLength, MatchPosVector &matchPosVector);

    /**
     * @brief Search a string and record where each hit was found.
     *
     * @param str            raw text to scan
     * @param matchPosVector receives one entry per hit: the 0-based text index
     *                       of the last character of the scan window, i.e.
     *                       match start + (shortest pattern length - 1)
     *
     * @return 0: no pattern matches, n: n hits found (n > 0)
     */
    int Search(const std::string& str, MatchPosVector &matchPosVector);

    /**
     * @brief Search a string and only count the hits.
     *
     * @param str raw text to scan
     *
     * @return 0: no pattern matches, n: n hits found (n > 0)
     */
    int Search(const std::string& str);

private:
    // minimum length of all patterns (width of the scan window)
    int32_t mMin;
    // SHIFT table: safe skip distance for each hashed block
    std::vector<int32_t> mShiftTable;
    // a combination of HASH and PREFIX table
    std::vector<PrefixTableType> mHashTable;
    // patterns
    std::vector<std::string> mPatterns;
    // size of SHIFT and HASH table
    int32_t mTableSize;
    // size of block (number of characters hashed together)
    int32_t mBlock;
};

#endif
1 #include <cmath> 2 #include <iostream> 3 #include "wumanber.h" 4 5 using namespace std; 6 7 /** 8 * @brief String hash function. 9 * 10 * @param str the string needed to be hashed 11 * @param len length of the substr should be hashed 12 * 13 * @return hash code 14 */ 15 unsigned int HashCode(const char* str, int len) 16 { 17 unsigned int hash = 0; 18 while (*str && len>0) 19 { 20 hash = (*str++) + (hash << 6) + (hash << 16) - hash; 21 --len; 22 } 23 return (hash & 0x7FFFFFFF); 24 } 25 26 /** 27 * @brief constructor 28 */ 29 WuManber::WuManber():mMin(0), mTableSize(0), mBlock(3) 30 { 31 //VOID 32 } 33 34 /** 35 * @brief Init 36 */ 37 bool WuManber::Init(const vector<string>& patterns) 38 { 39 int patternSize = patterns.size(); 40 41 //check if no pattern specified 42 if (patternSize == 0) 43 { 44 //cerr << "Error: wumanber init failed because no pattern specified." << endl; 45 return false; 46 } 47 48 //caculate the minmum pattern length 49 mMin = patterns[0].length(); 50 int32_t lenPattern = 0; 51 for (int i = 0; i < patternSize; ++i) 52 { 53 lenPattern = patterns[i].length(); 54 if (lenPattern < mMin) 55 { 56 mMin = lenPattern; 57 } 58 } 59 60 //check if mBlock larger than mMin 61 if (mBlock > mMin) 62 { 63 //cerr << "Warning: mBlock is larger than minmum pattern length, reset mBlock to minmum, but it will seriously affect the effiency." << endl; 64 mBlock = mMin; 65 } 66 67 //choose a suitable mTableSize for SHIFT, HASH table 68 int32_t primes[6] = {1003, 10007, 100003, 1000003, 10000019, 100000007}; 69 vector<int32_t> primeList(&primes[0], &primes[6]); 70 71 int32_t threshold = 10 * mMin; 72 for (size_t i = 0; i < primeList.size(); ++i) 73 { 74 if (primeList[i] > patternSize && primeList[i] / patternSize > threshold) 75 { 76 mTableSize = primeList[i]; 77 break; 78 } 79 } 80 cout << mTableSize << " " << mBlock << " " << mMin << endl; 81 //if size of patternList is huge. 
82 if (0 == mTableSize) 83 { 84 //cerr << "Warning: amount of pattern is very large, will cost a great amount of memory." << endl; 85 mTableSize = primeList[5]; 86 } 87 88 //construct ShiftTable and HashTable, and set default value for SHIFT table 89 mPatterns = patterns; 90 mHashTable.resize(mTableSize); 91 // default value is m-mBlock+1 for shift 92 int32_t defaultValue = mMin - mBlock + 1; 93 mShiftTable.resize(mTableSize, defaultValue); 94 95 //loop through patterns 96 for (int id = 0; id < patternSize; ++id) 97 { 98 // loop through each pattern from right to left 99 for (int index = mMin; index >= mBlock; --index) 100 { 101 unsigned int hash = HashCode(patterns[id].c_str() + index - mBlock, mBlock) % mTableSize; 102 if (mShiftTable[hash] > (mMin - index)) 103 { 104 mShiftTable[hash] = mMin - index; 105 } 106 if (index == mMin) 107 { 108 unsigned int prefixHash = HashCode(patterns[id].c_str(), mBlock); 109 mHashTable[hash].push_back(make_pair(prefixHash, id)); 110 } 111 } 112 } 113 cout << "Term number : " << mPatterns.size() << endl; 114 return true; 115 } 116 117 /** 118 * @brief destructor 119 */ 120 WuManber::~WuManber() 121 { 122 //VOID 123 } 124 125 126 /** 127 * @public 128 * @brief search multiple pattern in text at one time 129 */ 130 int WuManber::Search(const char* text, const int textLength, ResultSetType& res) 131 { 132 //hit count: value to be returned 133 int hits = 0; 134 int32_t index = mMin - 1; // start off by matching end of largest common pattern 135 136 int32_t blockMaxIndex = mBlock - 1; 137 int32_t windowMaxIndex = mMin - 1; 138 139 while (index < textLength) 140 { 141 unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock); 142 blockHash = blockHash % mTableSize; 143 int shift = mShiftTable[blockHash]; 144 if (shift > 0) 145 { 146 index += shift; 147 } 148 else 149 { 150 // we have a potential match when shift is 0 151 unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock); 152 PrefixTableType &element 
= mHashTable[blockHash]; 153 PrefixTableType::iterator iter = element.begin(); 154 155 while (element.end() != iter) 156 { 157 if (prefixHash == iter->first) 158 { 159 // since prefindex matches, compare target substring with pattern 160 // we know first two characters already match 161 const char* indexTarget = text + index - windowMaxIndex; //+mBlock 162 const char* indexPattern = mPatterns[iter->second].c_str(); //+mBlock 163 164 while (('\0' != *indexTarget) && ('\0' != *indexPattern)) 165 { 166 // match until we reach end of either string 167 if (*indexTarget == *indexPattern) 168 { 169 // match against chosen case sensitivity 170 ++indexTarget; 171 ++indexPattern; 172 } 173 else 174 break; 175 } 176 // match succeed since we reach the end of the pattern. 177 if ('\0' == *indexPattern) 178 { 179 res.insert(string(mPatterns[iter->second])); 180 ++hits; 181 } 182 }//end if 183 ++iter; 184 }//end while 185 ++index; 186 }//end else 187 }//end while 188 189 return hits; 190 } 191 192 /** 193 * Search 194 */ 195 int WuManber::Search(const string& str, ResultSetType& res) 196 { 197 return Search(str.c_str(), str.length(), res); 198 } 199 200 /** 201 * Search 202 */ 203 int WuManber::Search(const char* text, const int textLength) 204 { 205 //hit count: value to be returned 206 int hits = 0; 207 int index = mMin - 1; // start off by matching end of largest common pattern 208 209 uint32_t blockMaxIndex = mBlock - 1; 210 uint32_t windowMaxIndex = mMin - 1; 211 212 while (index < textLength) 213 { 214 unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock); 215 blockHash = blockHash % mTableSize; 216 int shift = mShiftTable[blockHash]; 217 if (shift > 0) 218 { 219 index += shift; 220 } 221 else 222 { 223 // we have a potential match when shift is 0 224 unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock); 225 //prefixHash = prefixHash % mTableSize; 226 PrefixTableType &element = mHashTable[blockHash]; 227 PrefixTableType::iterator iter 
= element.begin(); 228 229 while (element.end() != iter) 230 { 231 if (prefixHash == iter->first) 232 { 233 // since prefindex matches, compare target substring with pattern 234 // we know first two characters already match 235 const char* indexTarget = text + index - windowMaxIndex; //+mBlock 236 const char* indexPattern = mPatterns[iter->second].c_str(); //+mBlock 237 238 while (('\0' != *indexTarget) && ('\0' != *indexPattern)) 239 { 240 // match until we reach end of either string 241 if (*indexTarget == *indexPattern) 242 { 243 // match against chosen case sensitivity 244 ++indexTarget; 245 ++indexPattern; 246 } 247 else 248 break; 249 } 250 // match succeed since we reach the end of the pattern. 251 if ('\0' == *indexPattern) 252 { 253 ++hits; 254 } 255 }//end if 256 ++iter; 257 }//end while 258 ++index; 259 }//end else 260 }//end while 261 262 return hits; 263 } 264 265 int WuManber::Search(const char* text, const int textLength, MatchPosVector &matchPosVector) 266 { 267 //hit count: value to be returned 268 int hits = 0; 269 int index = mMin - 1; // start off by matching end of largest common pattern 270 271 uint32_t blockMaxIndex = mBlock - 1; 272 uint32_t windowMaxIndex = mMin - 1; 273 274 while (index < textLength) 275 { 276 unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock); 277 blockHash = blockHash % mTableSize; 278 int shift = mShiftTable[blockHash]; 279 if (shift > 0) 280 { 281 index += shift; 282 } 283 else 284 { 285 // we have a potential match when shift is 0 286 unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock); 287 //prefixHash = prefixHash % mTableSize; 288 PrefixTableType &element = mHashTable[blockHash]; 289 PrefixTableType::iterator iter = element.begin(); 290 291 while (element.end() != iter) 292 { 293 if (prefixHash == iter->first) 294 { 295 // since prefindex matches, compare target substring with pattern 296 // we know first two characters already match 297 const char* indexTarget = text + 
index - windowMaxIndex; //+mBlock 298 const char* indexPattern = mPatterns[iter->second].c_str(); //+mBlock 299 300 while (('\0' != *indexTarget) && ('\0' != *indexPattern)) 301 { 302 // match until we reach end of either string 303 if (*indexTarget == *indexPattern) 304 { 305 // match against chosen case sensitivity 306 ++indexTarget; 307 ++indexPattern; 308 } 309 else 310 break; 311 } 312 // match succeed since we reach the end of the pattern. 313 if ('\0' == *indexPattern) 314 { 315 ++hits; 316 matchPosVector.push_back(index); 317 } 318 }//end if 319 ++iter; 320 }//end while 321 ++index; 322 }//end else 323 }//end while 324 325 return hits; 326 } 327 328 int WuManber::Search(const string& str, MatchPosVector &matchPosVector) 329 { 330 return Search(str.c_str(), str.length(), matchPosVector); 331 } 332 333 int WuManber::Search(const string& str) 334 { 335 return Search(str.c_str(), str.length()); 336 }
#include <iostream>
#include <fstream>
#include <string.h>
#include <string>
#include <vector>
#include <algorithm>
//#include "wumanber.h"

using namespace std;


//WuManber search;


/**
 * Test driver: loads patterns from "test_wumanber.dat" and prints them,
 * one per line.
 *
 * A line is taken as a pattern only when its first byte has the value 1;
 * that marker byte is stripped before the pattern is stored.
 *
 * @return 0 on success, 1 if the pattern file cannot be opened
 */
int main()
{
    const char* const patternFile = "test_wumanber.dat";

    ifstream readfile;
    readfile.open(patternFile, ios::in);
    if (!readfile.is_open())
    {
        // Previously a missing file silently produced no output and exit code 0.
        cerr << "Error: cannot open " << patternFile << endl;
        return 1;
    }

    vector<string> pattern;
    string line;
    while (getline(readfile, line)) {
        // Compares against the byte value 1, not the character '1'.
        // NOTE(review): confirm the data file really uses a 0x01 marker byte.
        if (!line.empty() && line[0] == 1) {
            line.erase(0,1);
            pattern.push_back(line);
        }
    }

    for (vector<string>::const_iterator it = pattern.begin(); it != pattern.end(); ++it)
        cout << *it << endl;

    // Disabled matcher scaffolding, kept for reference:
    /*search.Init(pattern);*/
    //ResultSetType res;
    //vector<unsigned int> pos;
    //cout << search.Search(target, strlen(target), pos) << endl;
    /*cout << endl;*/
    return 0;
}