zoukankan      html  css  js  c++  java
  • C++里创建 Trie字典树(中文词典)(三)(联想)

      萌新做词典第三篇,做得不好,还请指正,谢谢大佬!

      今天把词典的联想做好了,也是比较low的,还改了之前的查询、遍历等代码。  Orz

      一样地先放上运行结果:

     1 test1
     2 ID : 2    char : 件    word : 编程软件
     3 ID : 3    char : 习    word : 编程学习
     4 ID : 4    char : 站    word : 编程学习网站
     5 ID : 1    char : 门    word : 编程入门
     6 
     7 test2
     8 ID : 5    char : 练    word : 编程训练
     9 ID : 1    char : 门    word : 编程入门
    10 ID : 3    char : 习    word : 编程学习
    11 ID : 4    char : 站    word : 编程学习网站
    12 ID : 2    char : 件    word : 编程软件
    13 find ID : 3    word : 编程学习
    14 
    15 associate "编程" : 
    16 find!
    17 训练
    18 入门
    19 学习
    20 学习网站
    21 软件

      测试用的test.cc

     1 #include "Dictionary.h"
     2 #include <iostream>
     3 #include <string>
     4 #include <vector>
     5 using std::cout;
     6 using std::endl;
     7 using std::string;
     8 using std::vector;
     9 
    10 int test1()
    11 {
    12     ccx::Dictionary words;
    13     string word1 = "编程入门";    
    14     string word2 = "编程软件";    
    15     string word3 = "编程学习";    
    16     string word4 = "编程学习网站";    
    17     
    18     words.push(word1);    
    19     words.push(word2);    
    20     words.push(word3);    
    21     words.push(word4);    
    22 
    23     words.resetIt();
    24     
    25     while(!words.isEnd())
    26     {
    27         cout << "ID : " << words.getCurWordId() 
    28             << "	char : " << words.getCurChar() 
    29             << "	word : " << words.getCurWord() << endl;
    30         words.next();
    31     }
    32     
    33     words.leading_out();
    34     return 0;
    35 }
    36 
    37 
    38 int test2()
    39 {
    40     ccx::Dictionary words;
    41     words.leading_in();
    42 
    43 
    44     string word("编程训练");
    45     words.push(word);
    46     words.resetIt();
    47 
    48     while(!words.isEnd())
    49     {
    50         cout << "ID : " << words.getCurWordId() 
    51             << "	char : " << words.getCurChar() 
    52             << "	word : " << words.getCurWord() << endl;
    53         words.next();
    54     }
    55     string tmp = "编程学习";    
    56     int id = words.search(tmp);
    57     if(-1 == id)
    58     {
    59         cout << "no such word like "" << tmp << """ << endl;
    60     }else{
    61         cout << "find ID : " << id
    62             << "	word : " << tmp << endl;
    63     }
    64 
    65     cout << endl;
    66     cout << "associate "编程" : " << endl;
    67 
    68     vector<string> data;    
    69     string temp = "编程";
    70     
    71     if(words.associate(temp, data))
    72     {
    73         cout << "find!" << endl;
    74         for(auto & elem : data)
    75         {
    76             cout << elem << endl;
    77         }
    78     }else{
    79         cout << "can't find" << endl;
    80     }
    81 
    82 
    83     return 0;
    84 }
    85 
    86 int main()
    87 {
    88     cout << "test1" << endl;
    89     test1();
    90     cout << endl;
    91     cout << "test2" << endl;
    92     test2();
    93     cout << endl;
    94 }
    View Code

      test1不变,test2 在导入后再插入一个词“编程训练”,发现ID是正常的。

      然后在test2最后调用联想函数,传入“编程”,能够正常传出所有的字符串。

      在做这个的时候,一开始想的很简单,就是拿传入的词去树中查找,找到最后一人字对应的节点,然后以那个节点为根进行遍历。然后就开开心心地去写了,结果写一部分就要对之前的代码进行更改,于是,这个接口越来越“肥”了:

    Dictionary.h

     1 #ifndef __DICTIONARY_H__
     2 #define __DICTIONARY_H__
     3 
     4 #include "DictionaryData.h"
     5 #include "DictionaryConf.h"
     6 
     7 #include <memory>
     8 #include <vector>
     9 #include <list>
    10 
    11 namespace ccx{
    12 
    13 using std::shared_ptr;
    14 using std::vector;
    15 using std::list;
    16 
    17 class Dictionary
    18 {
    19     typedef unordered_map<string, pDictElem>::iterator WordIt;
    20     public:
    21         Dictionary();
    22         void push(const string & word);
    23         void push(vector<string> & words);
    24         int search(const string & word);
    25         bool associate(const string & word, vector<string> & data);
    26     private:
    27         void AddWord(const string & word, int wordId);
    28         void splitWord(const string & word, vector<string> & characters);//把词拆成字
    29         int search(vector<string> & data, pDictElem & pcur);
    30         pDictElem _dictionary;
    31         DictionaryConf _conf;    
    32 
    33 //遍历
    34     public:
    35         string getCurChar();
    36         string getCurWord();
    37         int getCurWordId();
    38         bool isEnd();
    39         void resetIt();
    40         void next();
    41     private:
    42         void resetPoint(pDictElem pcur);
    43         void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
    44         void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
    45         string getCurWord(list<WordIt> & stackWord);
    46         
    47         pDictElem _pcur;
    48         WordIt _itcur;
    49         
    50 //用list实现栈,遍历时方便
    51         list<WordIt> _stackWord;
    52         list<pDictElem> _stackDict;
    53 
    54 //导入导出
    55     public:
    56         void leading_in();
    57         void leading_out();
    58 };
    59 
    60 }
    61 
    62 #endif

      对几个原有的函数进行了重载,主要是为了能够复用一些代码,但是又想不到合适的新的函数名(英语不太好Orz)。

      首先,是要能够查找并返回新的根结点,于是对search进行修改:

     1 int Dictionary::search(vector<string> & characters, pDictElem & root)
     2 {
     3     vector<string>::iterator it_char;
     4     it_char = characters.begin();    
     5     root = _dictionary;
     6     int i = 0;
     7     for(; it_char != characters.end(); ++it_char, ++i)
     8     {
     9         WordIt it_word;
    10         it_word = root->_words.find(*it_char);
    11         
    12         if(it_word == root->_words.end())
    13         {
    14             break;
    15         }else{
    16             root = it_word->second;
    17         }
    18     }
    19     return i;
    20 }

      形参第一项是分解后的字集,第二项是一个智能指针,指向某个节点。这里返回值改为了字集的第几项,有两个目的:

      1、插入函数中可以方便地知道下一个要插入的是哪个字符

      2、联想函数中可以判断字集中的字是否都存在于词典中

      3、好吧,我没想到其它好办法,而且当时是想到上面两点就这么做了,后来发现,插入部分的代码根本就不用改

      然后是重载search:

     1 int Dictionary::search(const string & word)
     2 {
     3     pDictElem root = _dictionary;
     4     vector<string> temp;
     5     splitWord(word, temp);
     6     
     7     int ret = search(temp, root);
     8     int size = temp.size();
     9     if(ret != size)
    10     {
    11         return -1;
    12     }
    13     return root->_wordId;
    14 }

      在这里对字进行分解,并定义一个临时的根结点,这样做的目的是为了保护private中的根结点,并且可以在多线程环境中互不干扰。

      能够找到“新的根”后,就要对它进行遍历了。如果只有单一线程或进程来使用它,这里可以直接把resetPoint(原来的)修改一下,设置指定结点就可以了:

     1 void Dictionary::resetPoint(pDictElem pcur)
     2 {
     3     _pcur = pcur;
     4     if(_stackDict.size())
     5     {
     6         _stackDict.clear();
     7     }
     8     if(_stackWord.size())
     9     {
    10         _stackWord.clear();
    11     }
    12     next();
    13 }

      如果是这样,那前面也完全不用修改。由于这个词典最后是要应用到miniSearchEngin中,于是我对遍历部分的函数进行了修改:

     1 void Dictionary::next()
     2 {
     3     next(_pcur, _stackWord, _stackDict);
     4 }
     5 
     6 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
     7 {
     8     while(pcur)
     9     {
    10         nextWord(pcur, stackWord, stackDict);
    11         if(!pcur || pcur->_wordId)
    12         {
    13             break;
    14         }
    15     }
    16 }
    17 
    18 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
    19 {
    20     if(pcur)
    21     {
    22         if(pcur->_words.size())
    23         {
    24             stackDict.push_back(pcur);
    25             stackWord.push_back(pcur->_words.begin());
    26             pcur = stackWord.back()->second;
    27         }else{
    28             ++(stackWord.back());
    29         }
    30         while(stackWord.back() == stackDict.back()->_words.end())
    31         {
    32             stackDict.pop_back();
    33             stackWord.pop_back();
    34             if(!stackDict.size())
    35             {
    36                 pcur = NULL;
    37             }
    38             ++(stackWord.back());
    39         }
    40         if(pcur)
    41         {
    42             pcur = stackWord.back()->second;
    43         }    
    44     }
    45 }

      next部分,改为传入参数,这样就可以在associate里定义临时的栈和智能指针等,遍历的时候与其它工作并没有任何关系。

      同样地,getWord也要做相同的更改:

     1 string Dictionary::getCurWord()
     2 {
     3     return getCurWord(_stackWord);
     4 }
     5 
     6 string Dictionary::getCurWord(list<WordIt> & stackWord)
     7 {
     8     string temp;
     9     list<WordIt>::iterator it_word;    
    10     it_word = stackWord.begin();    
    11 
    12     for(; it_word != stackWord.end(); ++it_word)
    13     {
    14         temp += (*it_word)->first;
    15     }
    16     return temp;
    17 }

      当然了,对外提供的接口都是不要传参的,其它的只能在内部使用,于是放入了private区。

      终于可以开始写联想了0.0

     1 bool Dictionary::associate(const string & word, vector<string> & data)
     2 {
     3     pDictElem root = _dictionary;
     4     vector<string> temp;
     5     splitWord(word, temp);
     6     
     7     int ret = search(temp, root);
     8     int size = temp.size();
     9     if(ret != size)
    10     {
    11         return false;
    12     }
    13     
    14     list<WordIt> stackWord;
    15     list<pDictElem> stackDict;
    16     next(root, stackWord, stackDict);
    17     while(root)
    18     {
    19         string temp = getCurWord(stackWord);
    20         data.push_back(temp);    
    21         next(root, stackWord, stackDict);
    22     }
    23     
    24     if(!data.size())
    25     {
    26         return false;
    27     }
    28     return true;
    29 }

      返回bool类型,可以方便地判断是否联想成功,即以传入的词做为前缀,能否找到剩余部分(词典里有存)。于是乎,一个渣渣型号的词典就做好啦~~~

    Dictionary.cc

      1 #include "Dictionary.h"
      2 #include <iostream>
      3 #include <fstream>
      4 #include <string>
      5 #include <json/json.h>
      6 
      7 namespace ccx{
      8 
      9 using std::endl;
     10 using std::cout;
     11 using std::pair;
     12 using std::ofstream;
     13 using std::ifstream;
     14 
     15 Dictionary::Dictionary()
     16 : _dictionary(new DictElem)
     17 , _conf()
     18 {
     19     _dictionary->_wordId = 0;
     20     _pcur = _dictionary;
     21 }
     22 
     23 void Dictionary::splitWord(const string & word, vector<string> & characters)
     24 {
     25     int num = word.size();
     26     int i = 0;
     27     while(i < num)
     28     {
     29         int size = 1;
     30         if(word[i] & 0x80)
     31         {
     32             char temp = word[i];
     33             temp <<= 1;
     34             do{
     35                 temp <<= 1;
     36                 ++size;
     37             }while(temp & 0x80);
     38         }
     39         string subWord;
     40         subWord = word.substr(i, size);
     41         characters.push_back(subWord);
     42         i += size;
     43     }
     44 }
     45 
     46 void Dictionary::AddWord(const string & word, int wordId)
     47 {
     48     vector<string> characters;
     49     splitWord(word, characters);
     50     
     51     vector<string>::iterator it_char;
     52     it_char = characters.begin();    
     53     pDictElem root;
     54     root = _dictionary;
     55     for(; it_char != characters.end(); ++it_char)
     56     {
     57         WordIt it_word;
     58         it_word = root->_words.find(*it_char);
     59         
     60         if(it_word == root->_words.end())
     61         {
     62             pair<string, pDictElem> temp;
     63             temp.first = *it_char;
     64             pDictElem dictemp(new DictElem);
     65             dictemp->_word = *it_char;
     66             dictemp->_wordId = 0;
     67             temp.second = dictemp;
     68             root->_words.insert(temp);
     69             root = dictemp;
     70         }else{
     71             root = it_word->second;
     72         }
     73     }
     74     if(!root->_wordId)
     75     {
     76         root->_wordId = wordId;
     77     }
     78 }
     79 
     80 void Dictionary::push(const string & word)
     81 {
     82     ++(_dictionary->_wordId);
     83     AddWord(word, _dictionary->_wordId);
     84 }
     85 
     86 void Dictionary::push(vector<string> & words)
     87 {
     88     int size = words.size();
     89     for(int i = 0; i < size; ++i)
     90     {
     91         push(words[i]);
     92     }
     93 }
     94 
     95 int Dictionary::search(const string & word)
     96 {
     97     pDictElem root = _dictionary;
     98     vector<string> temp;
     99     splitWord(word, temp);
    100     
    101     int ret = search(temp, root);
    102     int size = temp.size();
    103     if(ret != size)
    104     {
    105         return -1;
    106     }
    107     return root->_wordId;
    108 }
    109 
    110 int Dictionary::search(vector<string> & characters, pDictElem & root)
    111 {
    112     vector<string>::iterator it_char;
    113     it_char = characters.begin();    
    114     root = _dictionary;
    115     int i = 0;
    116     for(; it_char != characters.end(); ++it_char, ++i)
    117     {
    118         WordIt it_word;
    119         it_word = root->_words.find(*it_char);
    120         
    121         if(it_word == root->_words.end())
    122         {
    123             break;
    124         }else{
    125             root = it_word->second;
    126         }
    127     }
    128     return i;
    129 }
    130 
    131 bool Dictionary::associate(const string & word, vector<string> & data)
    132 {
    133     pDictElem root = _dictionary;
    134     vector<string> temp;
    135     splitWord(word, temp);
    136     
    137     int ret = search(temp, root);
    138     int size = temp.size();
    139     if(ret != size)
    140     {
    141         return false;
    142     }
    143     
    144     list<WordIt> stackWord;
    145     list<pDictElem> stackDict;
    146     next(root, stackWord, stackDict);
    147     while(root)
    148     {
    149         string temp = getCurWord(stackWord);
    150         data.push_back(temp);    
    151         next(root, stackWord, stackDict);
    152     }
    153     
    154     if(!data.size())
    155     {
    156         return false;
    157     }
    158     return true;
    159 }
    160 
    161 //遍历用
    162 
    163 void Dictionary::resetPoint(pDictElem pcur)
    164 {
    165     _pcur = pcur;
    166     if(_stackDict.size())
    167     {
    168         _stackDict.clear();
    169     }
    170     if(_stackWord.size())
    171     {
    172         _stackWord.clear();
    173     }
    174     next();
    175 }
    176 
    177 void Dictionary::resetIt()
    178 {
    179     resetPoint(_dictionary);
    180 }
    181 
    182 void Dictionary::next()
    183 {
    184     next(_pcur, _stackWord, _stackDict);
    185 }
    186 
    187 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
    188 {
    189     while(pcur)
    190     {
    191         nextWord(pcur, stackWord, stackDict);
    192         if(!pcur || pcur->_wordId)
    193         {
    194             break;
    195         }
    196     }
    197 }
    198 
    199 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
    200 {
    201     if(pcur)
    202     {
    203         if(pcur->_words.size())
    204         {
    205             stackDict.push_back(pcur);
    206             stackWord.push_back(pcur->_words.begin());
    207             pcur = stackWord.back()->second;
    208         }else{
    209             ++(stackWord.back());
    210         }
    211         while(stackWord.back() == stackDict.back()->_words.end())
    212         {
    213             stackDict.pop_back();
    214             stackWord.pop_back();
    215             if(!stackDict.size())
    216             {
    217                 pcur = NULL;
    218             }
    219             ++(stackWord.back());
    220         }
    221         if(pcur)
    222         {
    223             pcur = stackWord.back()->second;
    224         }    
    225     }
    226 }
    227 
    228 string Dictionary::getCurChar()
    229 {
    230     return _pcur->_word;
    231 }
    232 
    233 int Dictionary::getCurWordId()
    234 {
    235     return _pcur->_wordId;
    236 }
    237 
    238 string Dictionary::getCurWord()
    239 {
    240     return getCurWord(_stackWord);
    241 }
    242 
    243 string Dictionary::getCurWord(list<WordIt> & stackWord)
    244 {
    245     string temp;
    246     list<WordIt>::iterator it_word;    
    247     it_word = stackWord.begin();    
    248 
    249     for(; it_word != stackWord.end(); ++it_word)
    250     {
    251         temp += (*it_word)->first;
    252     }
    253     return temp;
    254 }
    255 
    256 bool Dictionary::isEnd()
    257 {
    258     return _pcur == NULL;
    259 }
    260 
    261 void Dictionary::leading_in()//导入,失败没必要退出程序
    262 {
    263     ifstream ifs;
    264     const char * path = _conf.getDictionaryPath().c_str();
    265     ifs.open(path);
    266     if(!ifs.good())
    267     {
    268         cout << "open Dictionary.json error(leading_in)" << endl;
    269     }else{
    270         Json::Value root;
    271         Json::Reader reader;
    272         
    273         if(!reader.parse(ifs, root, false))
    274         {
    275             cout << "json read Dictionary.json error" << endl;
    276         }else{
    277             int size = root.size();
    278             for(int i = 0; i < size; ++i)
    279             {
    280                 string word = root[i]["Word"].asString();
    281                 int wordId = root[i]["WordId"].asInt();
    282                 AddWord(word, wordId);
    283                 ++(_dictionary->_wordId);
    284             }
    285         }
    286     }
    287 }
    288 
    289 void Dictionary::leading_out()
    290 {
    291     Json::Value root;
    292     Json::FastWriter writer;
    293 
    294     resetIt();
    295     
    296     while(!isEnd())
    297     {
    298         Json::Value elem;
    299         elem["Word"] = getCurWord();
    300         elem["WordId"] = getCurWordId();
    301         root.append(elem);
    302         next();
    303     }
    304 
    305     string words;
    306     words = writer.write(root);
    307     
    308     ofstream ofs;
    309     const char * path = _conf.getDictionaryPath().c_str();
    310     ofs.open(path);
    311     if(!ofs.good())
    312     {
    313         cout << "open Dictionary.json error(leading_out)" << endl;
    314         ofs.open("Dictionary.tmp");
    315         if(!ofs.good())
    316         {
    317             exit(EXIT_FAILURE);
    318         }
    319     }
    320     
    321     ofs << words;
    322     ofs.close();
    323 }
    324 
    325 }
    View Code
  • 相关阅读:
    Python并发(一)
    Python协程详解(二)
    Python协程详解(一)
    Python装饰器
    ●BZOJ 3676 [Apio2014]回文串
    ●POJ 3974 Palindrome(Manacher)
    ●BZOJ 1692 [Usaco2007 Dec]队列变换
    ●BZOJ 4698 Sdoi2008 Sandy的卡片
    ●BZOJ 4516 [Sdoi2016]生成魔咒
    ●BZOJ 3238 [Ahoi2013]差异
  • 原文地址:https://www.cnblogs.com/chinxi/p/6134053.html
Copyright © 2011-2022 走看看