zoukankan      html  css  js  c++  java
  • xxx

     1 #ifndef ALGORITHM_WUMANBER_H
     2 #define ALGORITHM_WUMANBER_H
     3 
     4 #include <vector>
     5 #include <string>
     6 #include <set>
     7 
     8 typedef std::set<std::string> ResultSetType;
     9 typedef std::vector<unsigned int> MatchPosVector;
    10 typedef std::pair<unsigned int, int> PrefixIdPairType;
    11 typedef std::vector<PrefixIdPairType> PrefixTableType;
    12 
    13 class WuManber
    14 {
    15     public:
    16         WuManber();
    17         ~WuManber();
    18         /**
    19          * Init Function
    20          * 
    21          * @param patterns      pattern list to be matched
    22          */
    23         bool Init(const std::vector<std::string>& patterns);
    24 
    25         /** 
    26          * @param text           raw text
    27          * @param textLength     length of text
    28          * @param res            string set containing matched patterns
    29          * 
    30          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
    31          */
    32         int Search( const char* text, const int textLength, ResultSetType& res);
    33 
    34         /**
    35          * @param  str           raw text
    36          * @param  res           string set containing matched patterns
    37          *
    38          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
    39          */
    40          int Search(const std::string& str, ResultSetType& res);
    41 
    42         /**
    43          * @brief Search text 
    44          *
    45          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
    46          */
    47         int Search(const char* text, const int textLength);
    48         
    49         /**
    50          * @param  str           raw text
    51          * param  matchPosVector        vector containing matched patterns postion
    52          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
    53          */
    54         int Search(const char* text, const int textLength, MatchPosVector &matchPosVector);
    55         
    56         /**
    57          * param  matchPosVector        vector containing matched patterns postion
    58          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
    59          */
    60         int Search(const std::string& str, MatchPosVector &matchPosVector);
    61 
    62         /**
    63          * @brief Search text
    64          *
    65          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
    66          */
    67         int Search(const std::string& str);
    68 
    69     private:
    70         // minmum length of patterns
    71         int32_t mMin;
    72         // SHIFT table
    73         std::vector<int32_t> mShiftTable;
    74         // a combination of HASH and PREFIX table 
    75         std::vector<PrefixTableType> mHashTable;
    76         // patterns
    77         std::vector<std::string> mPatterns;
    78         // size of SHIFT and HASH table
    79         int32_t mTableSize;
    80         // size of block
    81         int32_t mBlock;
    82 };
    83 
    84 #endif
      1 #include <cmath>
      2 #include <iostream>
      3 #include "wumanber.h"
      4 
      5 using namespace std;
      6 
      7 /** 
      8  * @brief   String hash function.
      9  * 
     10  * @param str   the string needed to be hashed
     11  * @param len   length of the substr should be hashed
     12  * 
     13  * @return hash code
     14  */
     15 unsigned int HashCode(const char* str, int len)
     16 {
     17     unsigned int hash = 0;
     18     while (*str && len>0)
     19     {
     20         hash = (*str++) + (hash << 6) + (hash << 16) - hash;
     21         --len;
     22     }
     23     return (hash & 0x7FFFFFFF);
     24 }
     25 
     26 /** 
     27  * @brief constructor 
     28  */
     29 WuManber::WuManber():mMin(0), mTableSize(0), mBlock(3)
     30 {
     31     //VOID
     32 }
     33 
     34 /**
     35  * @brief Init
     36  */
     37 bool WuManber::Init(const vector<string>& patterns)
     38 {
     39     int patternSize = patterns.size();
     40 
     41     //check if no pattern specified
     42     if (patternSize == 0)
     43     {
     44         //cerr << "Error: wumanber init failed because no pattern specified." << endl;
     45         return false;
     46     }
     47     
     48     //caculate the minmum pattern length
     49     mMin = patterns[0].length();
     50     int32_t lenPattern = 0;
     51     for (int i = 0; i < patternSize; ++i) 
     52     {
     53         lenPattern = patterns[i].length();
     54         if (lenPattern < mMin)
     55         {
     56             mMin = lenPattern;
     57         }
     58     }
     59 
     60     //check if mBlock larger than mMin
     61     if (mBlock > mMin)
     62     {
     63         //cerr << "Warning: mBlock is larger than minmum pattern length, reset mBlock to minmum, but it will seriously affect the effiency." << endl;
     64         mBlock = mMin;
     65     }
     66 
     67     //choose a suitable mTableSize for SHIFT, HASH table
     68     int32_t primes[6] = {1003, 10007, 100003, 1000003, 10000019, 100000007};
     69     vector<int32_t> primeList(&primes[0], &primes[6]);
     70 
     71     int32_t threshold = 10 * mMin;
     72     for (size_t i = 0; i < primeList.size(); ++i)
     73     {
     74         if (primeList[i] > patternSize && primeList[i] / patternSize > threshold)
     75         {
     76             mTableSize = primeList[i];
     77             break;
     78         }
     79     }
     80     cout << mTableSize << " " << mBlock << " " << mMin << endl;
     81     //if size of patternList is huge.
     82     if (0 == mTableSize)
     83     {
     84         //cerr << "Warning: amount of pattern is very large, will cost a great amount of memory." << endl;
     85         mTableSize = primeList[5];
     86     }
     87 
     88     //construct ShiftTable and HashTable, and set default value for SHIFT table
     89     mPatterns = patterns;
     90     mHashTable.resize(mTableSize);
     91     // default value is m-mBlock+1 for shift
     92     int32_t defaultValue = mMin - mBlock + 1;
     93     mShiftTable.resize(mTableSize, defaultValue);
     94 
     95     //loop through patterns
     96     for (int id = 0; id < patternSize; ++id) 
     97     { 
     98         // loop through each pattern from right to left
     99         for (int index = mMin; index >= mBlock; --index)
    100         {
    101             unsigned int hash = HashCode(patterns[id].c_str() + index - mBlock, mBlock) % mTableSize;
    102             if (mShiftTable[hash] > (mMin - index))
    103             {
    104                 mShiftTable[hash]  = mMin - index;
    105             }
    106             if (index == mMin)
    107             {
    108                 unsigned int prefixHash = HashCode(patterns[id].c_str(), mBlock);
    109                 mHashTable[hash].push_back(make_pair(prefixHash, id));
    110             }
    111         }
    112     }
    113     cout << "Term number : " <<  mPatterns.size() << endl;
    114     return true;
    115 }
    116 
    117 /** 
    118  * @brief destructor
    119  */
    120 WuManber::~WuManber()
    121 {
    122     //VOID
    123 }
    124 
    125 
    126 /**
    127  * @public
    128  * @brief search multiple pattern in text at one time
    129  */
    130 int WuManber::Search(const char* text, const int textLength, ResultSetType& res)
    131 {
    132     //hit count: value to be returned
    133     int hits = 0;
    134     int32_t index = mMin - 1; // start off by matching end of largest common pattern
    135     
    136     int32_t blockMaxIndex = mBlock - 1;
    137     int32_t windowMaxIndex = mMin - 1;
    138     
    139     while (index < textLength)
    140     {
    141         unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
    142         blockHash = blockHash % mTableSize;
    143         int shift = mShiftTable[blockHash];
    144         if (shift > 0)
    145         {
    146             index += shift;
    147         }
    148         else
    149         {  
    150             // we have a potential match when shift is 0
    151             unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
    152             PrefixTableType &element = mHashTable[blockHash];
    153             PrefixTableType::iterator iter = element.begin();
    154 
    155             while (element.end() != iter)
    156             {
    157                 if (prefixHash == iter->first)
    158                 {   
    159                     // since prefindex matches, compare target substring with pattern
    160                     // we know first two characters already match
    161                     const char* indexTarget = text + index - windowMaxIndex;    //+mBlock
    162                     const char* indexPattern = mPatterns[iter->second].c_str(); //+mBlock
    163                     
    164                     while (('\0' != *indexTarget) && ('\0' != *indexPattern))
    165                     {
    166                         // match until we reach end of either string
    167                         if (*indexTarget == *indexPattern)
    168                         {
    169                             // match against chosen case sensitivity
    170                             ++indexTarget;
    171                             ++indexPattern;
    172                         }
    173                         else
    174                             break;
    175                     }
    176                     // match succeed since we reach the end of the pattern.
    177                     if ('\0' == *indexPattern)
    178                     {
    179                         res.insert(string(mPatterns[iter->second]));
    180                         ++hits;
    181                     }
    182                 }//end if
    183                 ++iter;
    184             }//end while
    185             ++index;
    186         }//end else
    187     }//end while
    188 
    189     return hits;
    190 }
    191 
    192 /**
    193  * Search
    194  */
    195 int WuManber::Search(const string& str, ResultSetType& res)
    196 {
    197     return Search(str.c_str(), str.length(), res);
    198 }
    199 
    200 /**
    201  * Search
    202  */
    203 int WuManber::Search(const char* text, const int textLength)
    204 {
    205     //hit count: value to be returned
    206     int hits = 0;
    207     int index = mMin - 1; // start off by matching end of largest common pattern
    208 
    209     uint32_t blockMaxIndex = mBlock - 1;
    210     uint32_t windowMaxIndex = mMin - 1;
    211 
    212     while (index < textLength)
    213     {
    214         unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
    215         blockHash = blockHash % mTableSize;
    216         int shift = mShiftTable[blockHash];
    217         if (shift > 0)
    218         {
    219             index += shift;
    220         }
    221         else
    222         {
    223             // we have a potential match when shift is 0
    224             unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
    225             //prefixHash = prefixHash % mTableSize;
    226             PrefixTableType &element = mHashTable[blockHash];
    227             PrefixTableType::iterator iter = element.begin();
    228 
    229             while (element.end() != iter)
    230             {
    231                 if (prefixHash == iter->first)
    232                 {
    233                     // since prefindex matches, compare target substring with pattern
    234                     // we know first two characters already match
    235                     const char* indexTarget = text + index - windowMaxIndex;    //+mBlock
    236                     const char* indexPattern = mPatterns[iter->second].c_str();  //+mBlock
    237 
    238                     while (('\0' != *indexTarget) && ('\0' != *indexPattern))
    239                     {
    240                         // match until we reach end of either string
    241                         if (*indexTarget == *indexPattern)
    242                         {
    243                             // match against chosen case sensitivity
    244                             ++indexTarget;
    245                             ++indexPattern;
    246                         }
    247                         else
    248                             break;
    249                     }
    250                     // match succeed since we reach the end of the pattern.
    251                     if ('\0' == *indexPattern)
    252                     {
    253                         ++hits;
    254                     }
    255                 }//end if
    256                 ++iter;
    257             }//end while
    258             ++index;
    259         }//end else
    260     }//end while
    261 
    262     return hits;
    263 }
    264 
    265 int WuManber::Search(const char* text, const int textLength, MatchPosVector &matchPosVector)
    266 {
    267     //hit count: value to be returned
    268     int hits = 0;
    269     int index = mMin - 1; // start off by matching end of largest common pattern
    270 
    271     uint32_t blockMaxIndex = mBlock - 1;
    272     uint32_t windowMaxIndex = mMin - 1;
    273 
    274     while (index < textLength)
    275     {
    276         unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
    277         blockHash = blockHash % mTableSize;
    278         int shift = mShiftTable[blockHash];
    279         if (shift > 0)
    280         {
    281             index += shift;
    282         }
    283         else
    284         {
    285             // we have a potential match when shift is 0
    286             unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
    287             //prefixHash = prefixHash % mTableSize;
    288             PrefixTableType &element = mHashTable[blockHash];
    289             PrefixTableType::iterator iter = element.begin();
    290 
    291             while (element.end() != iter)
    292             {
    293                 if (prefixHash == iter->first)
    294                 {
    295                     // since prefindex matches, compare target substring with pattern
    296                     // we know first two characters already match
    297                     const char* indexTarget = text + index - windowMaxIndex;    //+mBlock
    298                     const char* indexPattern = mPatterns[iter->second].c_str();  //+mBlock
    299 
    300                     while (('\0' != *indexTarget) && ('\0' != *indexPattern))
    301                     {
    302                         // match until we reach end of either string
    303                         if (*indexTarget == *indexPattern)
    304                         {
    305                             // match against chosen case sensitivity
    306                             ++indexTarget;
    307                             ++indexPattern;
    308                         }
    309                         else
    310                             break;
    311                     }
    312                     // match succeed since we reach the end of the pattern.
    313                     if ('\0' == *indexPattern)
    314                     {
    315                         ++hits;
    316                         matchPosVector.push_back(index);
    317                     }
    318                 }//end if
    319                 ++iter;
    320             }//end while
    321             ++index;
    322         }//end else
    323     }//end while
    324 
    325     return hits;
    326 }
    327 
    328 int WuManber::Search(const string& str, MatchPosVector &matchPosVector)
    329 {
    330     return Search(str.c_str(), str.length(), matchPosVector);
    331 }
    332 
    333 int WuManber::Search(const string& str)
    334 {
    335     return Search(str.c_str(), str.length());
    336 }
     1 #include <iostream>
     2 #include <fstream>
     3 #include <string.h>
     4 #include <vector>
     5 #include <algorithm>
     6 //#include "wumanber.h"
     7 
     8 using namespace std;
     9 
    10 
    11 //WuManber search;
    12 
    13 
    14 int main()
    15 {
    16     ifstream readfile;
    17     string line;
    18     readfile.open("test_wumanber.dat", ios::in);
    19     vector<string> pattern;
    20     vector<unsigned int> pos;
    21     while (getline(readfile, line)) {
    22         if (line[0] == 1) {
    23             line.erase(0,1);
    24             pattern.push_back(line);
    25         }
    26     }
    27     for (vector<string>::iterator it = pattern.begin(); it !=\
    28             pattern.end(); it++)
    29         cout << *it << endl;
    30     /*search.Init(pattern);*/
    31     //ResultSetType res;
    32     //cout << search.Search(target, strlen(target), pos) << endl;
    33     /*cout << endl;*/
    34 }
  • 相关阅读:
    linux 终端相关
    「CF10D」LCIS
    「SP1043」GSS1
    「NOI2009」二叉查找树
    「CF650E」Clockwork Bomb
    「UVA10559」Blocks
    「LuoguP3979」遥远的国度
    「SDOI2015」寻宝游戏
    「CF741D」Arpa’s letter-marked tree and Mehrdad’s Dokhtar-kosh paths
    「CF600E」Lomsat gelral
  • 原文地址:https://www.cnblogs.com/hengli/p/2974072.html
Copyright © 2011-2022 走看看