zoukankan      html  css  js  c++  java
  • C++里创建 Trie字典树(中文词典)(三)(联想)

      萌新做词典第三篇,做得不好,还请指正,谢谢大佬!

      今天把词典的联想做好了,也是比较low的,还改了之前的查询、遍历等代码。  Orz

      一样地先放上运行结果:

     1 test1
     2 ID : 2    char : 件    word : 编程软件
     3 ID : 3    char : 习    word : 编程学习
     4 ID : 4    char : 站    word : 编程学习网站
     5 ID : 1    char : 门    word : 编程入门
     6 
     7 test2
     8 ID : 5    char : 练    word : 编程训练
     9 ID : 1    char : 门    word : 编程入门
    10 ID : 3    char : 习    word : 编程学习
    11 ID : 4    char : 站    word : 编程学习网站
    12 ID : 2    char : 件    word : 编程软件
    13 find ID : 3    word : 编程学习
    14 
    15 associate "编程" : 
    16 find!
    17 训练
    18 入门
    19 学习
    20 学习网站
    21 软件

      测试用的test.cc

     1 #include "Dictionary.h"
     2 #include <iostream>
     3 #include <string>
     4 #include <vector>
     5 using std::cout;
     6 using std::endl;
     7 using std::string;
     8 using std::vector;
     9 
    10 int test1()
    11 {
    12     ccx::Dictionary words;
    13     string word1 = "编程入门";    
    14     string word2 = "编程软件";    
    15     string word3 = "编程学习";    
    16     string word4 = "编程学习网站";    
    17     
    18     words.push(word1);    
    19     words.push(word2);    
    20     words.push(word3);    
    21     words.push(word4);    
    22 
    23     words.resetIt();
    24     
    25     while(!words.isEnd())
    26     {
    27         cout << "ID : " << words.getCurWordId() 
    28             << "	char : " << words.getCurChar() 
    29             << "	word : " << words.getCurWord() << endl;
    30         words.next();
    31     }
    32     
    33     words.leading_out();
    34     return 0;
    35 }
    36 
    37 
    38 int test2()
    39 {
    40     ccx::Dictionary words;
    41     words.leading_in();
    42 
    43 
    44     string word("编程训练");
    45     words.push(word);
    46     words.resetIt();
    47 
    48     while(!words.isEnd())
    49     {
    50         cout << "ID : " << words.getCurWordId() 
    51             << "	char : " << words.getCurChar() 
    52             << "	word : " << words.getCurWord() << endl;
    53         words.next();
    54     }
    55     string tmp = "编程学习";    
    56     int id = words.search(tmp);
    57     if(-1 == id)
    58     {
    59         cout << "no such word like "" << tmp << """ << endl;
    60     }else{
    61         cout << "find ID : " << id
    62             << "	word : " << tmp << endl;
    63     }
    64 
    65     cout << endl;
    66     cout << "associate "编程" : " << endl;
    67 
    68     vector<string> data;    
    69     string temp = "编程";
    70     
    71     if(words.associate(temp, data))
    72     {
    73         cout << "find!" << endl;
    74         for(auto & elem : data)
    75         {
    76             cout << elem << endl;
    77         }
    78     }else{
    79         cout << "can't find" << endl;
    80     }
    81 
    82 
    83     return 0;
    84 }
    85 
    86 int main()
    87 {
    88     cout << "test1" << endl;
    89     test1();
    90     cout << endl;
    91     cout << "test2" << endl;
    92     test2();
    93     cout << endl;
    94 }
    View Code

      test1不变,test2 在导入后再插入一个词“编程训练”,发现ID是正常的。

      然后在test2最后调用联想函数,传入“编程”,能够正常传出所有的字符串。

      在做这个的时候,一开始想的很简单,就是拿传入的词去树中查找,找到最后一个字对应的节点,然后以那个节点为根进行遍历。然后就开开心心地去写了,结果写一部分就要对之前的代码进行更改,于是,这个接口越来越“肥”了:

    Dictionary.h

     1 #ifndef __DICTIONARY_H__
     2 #define __DICTIONARY_H__
     3 
     4 #include "DictionaryData.h"
     5 #include "DictionaryConf.h"
     6 
     7 #include <memory>
     8 #include <vector>
     9 #include <list>
    10 
    11 namespace ccx{
    12 
    13 using std::shared_ptr;
    14 using std::vector;
    15 using std::list;
    16 
    // Trie-based word dictionary: every node maps one character (stored as a
    // string) to a child node; word IDs are kept on the nodes.
    17 class Dictionary
    18 {
    19     typedef unordered_map<string, pDictElem>::iterator WordIt; // iterator over one node's children
    20     public:
    21         Dictionary();
    22         void push(const string & word); // insert one word
    23         void push(vector<string> & words); // insert every word of the vector
    24         int search(const string & word); // returns -1 when the characters of `word` do not form a path in the trie
    25         bool associate(const string & word, vector<string> & data); // appends to `data` the suffixes of stored words that start with `word`
    26     private:
    27         void AddWord(const string & word, int wordId);
    28         void splitWord(const string & word, vector<string> & characters);// split a word into its individual characters
    29         int search(vector<string> & data, pDictElem & pcur); // returns how many leading characters matched; pcur receives the last node reached
    30         pDictElem _dictionary; // root node of the trie
    31         DictionaryConf _conf;    
    32 
    33 // Traversal
    34     public:
    35         string getCurChar();
    36         string getCurWord();
    37         int getCurWordId();
    38         bool isEnd();
    39         void resetIt();
    40         void next();
    41     private:
    42         void resetPoint(pDictElem pcur);
    43         void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
    44         void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
    45         string getCurWord(list<WordIt> & stackWord);
    46         
    47         pDictElem _pcur; // node the built-in traversal cursor currently points at
    48         WordIt _itcur;
    49         
    50 // A list is used as a stack; convenient while traversing
    51         list<WordIt> _stackWord;
    52         list<pDictElem> _stackDict;
    53 
    54 // Import / export
    55     public:
    56         void leading_in();
    57         void leading_out();
    58 };
    59 
    60 }
    61 
    62 #endif

      对几个原有的函数进行了重载,主要是为了能够复用一些代码,但是又想不到合适的新的函数名(英语不太好Orz)。

      首先,是要能够查找并返回新的根结点,于是对search进行修改:

     1 int Dictionary::search(vector<string> & characters, pDictElem & root)
     2 {
     3     vector<string>::iterator it_char;
     4     it_char = characters.begin();    
     5     root = _dictionary;
     6     int i = 0;
     7     for(; it_char != characters.end(); ++it_char, ++i)
     8     {
     9         WordIt it_word;
    10         it_word = root->_words.find(*it_char);
    11         
    12         if(it_word == root->_words.end())
    13         {
    14             break;
    15         }else{
    16             root = it_word->second;
    17         }
    18     }
    19     return i;
    20 }

      形参第一项是分解后的字集,第二项是一个智能指针,指向某个节点。这里返回值改为了字集的第几项,有两个目的:

      1、插入函数中可以方便地知道下一个要插入的是哪个字符

      2、联想函数中可以判断字集中的字是否都存在于词典中

      3、好吧,我没想到其它好办法,而且当时是想到上面两点就这么做了,后来发现,插入部分的代码根本就不用改

      然后是重载search:

     1 int Dictionary::search(const string & word)
     2 {
     3     pDictElem root = _dictionary;
     4     vector<string> temp;
     5     splitWord(word, temp);
     6     
     7     int ret = search(temp, root);
     8     int size = temp.size();
     9     if(ret != size)
    10     {
    11         return -1;
    12     }
    13     return root->_wordId;
    14 }

      在这里对字进行分解,并定义一个临时的根结点,这样做的目的是为了保护private中的根结点,并且可以在多线程环境中互不干扰。

      能够找到“新的根”后,就要对它进行遍历了。如果只有单一线程或进程来使用它,这里可以直接把resetPoint(原来的)修改一下,设置指定结点就可以了:

     1 void Dictionary::resetPoint(pDictElem pcur)
     2 {
     3     _pcur = pcur;
     4     if(_stackDict.size())
     5     {
     6         _stackDict.clear();
     7     }
     8     if(_stackWord.size())
     9     {
    10         _stackWord.clear();
    11     }
    12     next();
    13 }

      如果是这样,那前面也完全不用修改。由于这个词典最后是要应用到miniSearchEngine中,于是我对遍历部分的函数进行了修改:

     1 void Dictionary::next()
     2 {
     3     next(_pcur, _stackWord, _stackDict);
     4 }
     5 
     6 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
     7 {
     8     while(pcur)
     9     {
    10         nextWord(pcur, stackWord, stackDict);
    11         if(!pcur || pcur->_wordId)
    12         {
    13             break;
    14         }
    15     }
    16 }
    17 
    18 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
    19 {
    20     if(pcur)
    21     {
    22         if(pcur->_words.size())
    23         {
    24             stackDict.push_back(pcur);
    25             stackWord.push_back(pcur->_words.begin());
    26             pcur = stackWord.back()->second;
    27         }else{
    28             ++(stackWord.back());
    29         }
    30         while(stackWord.back() == stackDict.back()->_words.end())
    31         {
    32             stackDict.pop_back();
    33             stackWord.pop_back();
    34             if(!stackDict.size())
    35             {
    36                 pcur = NULL;
    37             }
    38             ++(stackWord.back());
    39         }
    40         if(pcur)
    41         {
    42             pcur = stackWord.back()->second;
    43         }    
    44     }
    45 }

      next部分,改为传入参数,这样就可以在associate里定义临时的栈和智能指针等,遍历的时候与其它工作并没有任何关系。

      同样地,getWord也要做相同的更改:

     1 string Dictionary::getCurWord()
     2 {
     3     return getCurWord(_stackWord);
     4 }
     5 
     6 string Dictionary::getCurWord(list<WordIt> & stackWord)
     7 {
     8     string temp;
     9     list<WordIt>::iterator it_word;    
    10     it_word = stackWord.begin();    
    11 
    12     for(; it_word != stackWord.end(); ++it_word)
    13     {
    14         temp += (*it_word)->first;
    15     }
    16     return temp;
    17 }

      当然了,对外提供的接口都是不要传参的,其它的只能在内部使用,于是放入了private区。

      终于可以开始写联想了0.0

     1 bool Dictionary::associate(const string & word, vector<string> & data)
     2 {
     3     pDictElem root = _dictionary;
     4     vector<string> temp;
     5     splitWord(word, temp);
     6     
     7     int ret = search(temp, root);
     8     int size = temp.size();
     9     if(ret != size)
    10     {
    11         return false;
    12     }
    13     
    14     list<WordIt> stackWord;
    15     list<pDictElem> stackDict;
    16     next(root, stackWord, stackDict);
    17     while(root)
    18     {
    19         string temp = getCurWord(stackWord);
    20         data.push_back(temp);    
    21         next(root, stackWord, stackDict);
    22     }
    23     
    24     if(!data.size())
    25     {
    26         return false;
    27     }
    28     return true;
    29 }

      返回bool类型,可以方便地判断是否联想成功,即以传入的词做为前缀,能否找到剩余部分(词典里有存)。于是乎,一个渣渣型号的词典就做好啦~~~

    Dictionary.cc

      1 #include "Dictionary.h"
      2 #include <iostream>
      3 #include <fstream>
      4 #include <string>
      5 #include <json/json.h>
      6 
      7 namespace ccx{
      8 
      9 using std::endl;
     10 using std::cout;
     11 using std::pair;
     12 using std::ofstream;
     13 using std::ifstream;
     14 
     15 Dictionary::Dictionary()
     16 : _dictionary(new DictElem)
     17 , _conf()
     18 {
     19     _dictionary->_wordId = 0;
     20     _pcur = _dictionary;
     21 }
     22 
     23 void Dictionary::splitWord(const string & word, vector<string> & characters)
     24 {
     25     int num = word.size();
     26     int i = 0;
     27     while(i < num)
     28     {
     29         int size = 1;
     30         if(word[i] & 0x80)
     31         {
     32             char temp = word[i];
     33             temp <<= 1;
     34             do{
     35                 temp <<= 1;
     36                 ++size;
     37             }while(temp & 0x80);
     38         }
     39         string subWord;
     40         subWord = word.substr(i, size);
     41         characters.push_back(subWord);
     42         i += size;
     43     }
     44 }
     45 
     46 void Dictionary::AddWord(const string & word, int wordId)
     47 {
     48     vector<string> characters;
     49     splitWord(word, characters);
     50     
     51     vector<string>::iterator it_char;
     52     it_char = characters.begin();    
     53     pDictElem root;
     54     root = _dictionary;
     55     for(; it_char != characters.end(); ++it_char)
     56     {
     57         WordIt it_word;
     58         it_word = root->_words.find(*it_char);
     59         
     60         if(it_word == root->_words.end())
     61         {
     62             pair<string, pDictElem> temp;
     63             temp.first = *it_char;
     64             pDictElem dictemp(new DictElem);
     65             dictemp->_word = *it_char;
     66             dictemp->_wordId = 0;
     67             temp.second = dictemp;
     68             root->_words.insert(temp);
     69             root = dictemp;
     70         }else{
     71             root = it_word->second;
     72         }
     73     }
     74     if(!root->_wordId)
     75     {
     76         root->_wordId = wordId;
     77     }
     78 }
     79 
     80 void Dictionary::push(const string & word)
     81 {
     82     ++(_dictionary->_wordId);
     83     AddWord(word, _dictionary->_wordId);
     84 }
     85 
     86 void Dictionary::push(vector<string> & words)
     87 {
     88     int size = words.size();
     89     for(int i = 0; i < size; ++i)
     90     {
     91         push(words[i]);
     92     }
     93 }
     94 
     95 int Dictionary::search(const string & word)
     96 {
     97     pDictElem root = _dictionary;
     98     vector<string> temp;
     99     splitWord(word, temp);
    100     
    101     int ret = search(temp, root);
    102     int size = temp.size();
    103     if(ret != size)
    104     {
    105         return -1;
    106     }
    107     return root->_wordId;
    108 }
    109 
    110 int Dictionary::search(vector<string> & characters, pDictElem & root)
    111 {
    112     vector<string>::iterator it_char;
    113     it_char = characters.begin();    
    114     root = _dictionary;
    115     int i = 0;
    116     for(; it_char != characters.end(); ++it_char, ++i)
    117     {
    118         WordIt it_word;
    119         it_word = root->_words.find(*it_char);
    120         
    121         if(it_word == root->_words.end())
    122         {
    123             break;
    124         }else{
    125             root = it_word->second;
    126         }
    127     }
    128     return i;
    129 }
    130 
    131 bool Dictionary::associate(const string & word, vector<string> & data)
    132 {
    133     pDictElem root = _dictionary;
    134     vector<string> temp;
    135     splitWord(word, temp);
    136     
    137     int ret = search(temp, root);
    138     int size = temp.size();
    139     if(ret != size)
    140     {
    141         return false;
    142     }
    143     
    144     list<WordIt> stackWord;
    145     list<pDictElem> stackDict;
    146     next(root, stackWord, stackDict);
    147     while(root)
    148     {
    149         string temp = getCurWord(stackWord);
    150         data.push_back(temp);    
    151         next(root, stackWord, stackDict);
    152     }
    153     
    154     if(!data.size())
    155     {
    156         return false;
    157     }
    158     return true;
    159 }
    160 
    161 //遍历用
    162 
    163 void Dictionary::resetPoint(pDictElem pcur)
    164 {
    165     _pcur = pcur;
    166     if(_stackDict.size())
    167     {
    168         _stackDict.clear();
    169     }
    170     if(_stackWord.size())
    171     {
    172         _stackWord.clear();
    173     }
    174     next();
    175 }
    176 
    177 void Dictionary::resetIt()
    178 {
    179     resetPoint(_dictionary);
    180 }
    181 
    182 void Dictionary::next()
    183 {
    184     next(_pcur, _stackWord, _stackDict);
    185 }
    186 
    187 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
    188 {
    189     while(pcur)
    190     {
    191         nextWord(pcur, stackWord, stackDict);
    192         if(!pcur || pcur->_wordId)
    193         {
    194             break;
    195         }
    196     }
    197 }
    198 
    199 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
    200 {
    201     if(pcur)
    202     {
    203         if(pcur->_words.size())
    204         {
    205             stackDict.push_back(pcur);
    206             stackWord.push_back(pcur->_words.begin());
    207             pcur = stackWord.back()->second;
    208         }else{
    209             ++(stackWord.back());
    210         }
    211         while(stackWord.back() == stackDict.back()->_words.end())
    212         {
    213             stackDict.pop_back();
    214             stackWord.pop_back();
    215             if(!stackDict.size())
    216             {
    217                 pcur = NULL;
    218             }
    219             ++(stackWord.back());
    220         }
    221         if(pcur)
    222         {
    223             pcur = stackWord.back()->second;
    224         }    
    225     }
    226 }
    227 
    // Returns the _word field of the node the built-in cursor points at.
    // NOTE(review): _pcur is dereferenced without a null check — presumably
    // callers are expected to test isEnd() first; confirm.
    228 string Dictionary::getCurChar()
    229 {
    230     return _pcur->_word;
    231 }
    232 
    // Returns the _wordId field of the node the built-in cursor points at.
    // NOTE(review): same unchecked dereference of _pcur as getCurChar().
    233 int Dictionary::getCurWordId()
    234 {
    235     return _pcur->_wordId;
    236 }
    237 
    238 string Dictionary::getCurWord()
    239 {
    240     return getCurWord(_stackWord);
    241 }
    242 
    243 string Dictionary::getCurWord(list<WordIt> & stackWord)
    244 {
    245     string temp;
    246     list<WordIt>::iterator it_word;    
    247     it_word = stackWord.begin();    
    248 
    249     for(; it_word != stackWord.end(); ++it_word)
    250     {
    251         temp += (*it_word)->first;
    252     }
    253     return temp;
    254 }
    255 
    256 bool Dictionary::isEnd()
    257 {
    258     return _pcur == NULL;
    259 }
    260 
    261 void Dictionary::leading_in()//导入,失败没必要退出程序
    262 {
    263     ifstream ifs;
    264     const char * path = _conf.getDictionaryPath().c_str();
    265     ifs.open(path);
    266     if(!ifs.good())
    267     {
    268         cout << "open Dictionary.json error(leading_in)" << endl;
    269     }else{
    270         Json::Value root;
    271         Json::Reader reader;
    272         
    273         if(!reader.parse(ifs, root, false))
    274         {
    275             cout << "json read Dictionary.json error" << endl;
    276         }else{
    277             int size = root.size();
    278             for(int i = 0; i < size; ++i)
    279             {
    280                 string word = root[i]["Word"].asString();
    281                 int wordId = root[i]["WordId"].asInt();
    282                 AddWord(word, wordId);
    283                 ++(_dictionary->_wordId);
    284             }
    285         }
    286     }
    287 }
    288 
    289 void Dictionary::leading_out()
    290 {
    291     Json::Value root;
    292     Json::FastWriter writer;
    293 
    294     resetIt();
    295     
    296     while(!isEnd())
    297     {
    298         Json::Value elem;
    299         elem["Word"] = getCurWord();
    300         elem["WordId"] = getCurWordId();
    301         root.append(elem);
    302         next();
    303     }
    304 
    305     string words;
    306     words = writer.write(root);
    307     
    308     ofstream ofs;
    309     const char * path = _conf.getDictionaryPath().c_str();
    310     ofs.open(path);
    311     if(!ofs.good())
    312     {
    313         cout << "open Dictionary.json error(leading_out)" << endl;
    314         ofs.open("Dictionary.tmp");
    315         if(!ofs.good())
    316         {
    317             exit(EXIT_FAILURE);
    318         }
    319     }
    320     
    321     ofs << words;
    322     ofs.close();
    323 }
    324 
    325 }
    View Code
  • 相关阅读:
    VC 常见问题百问
    python windows 环境变量
    Check server headers and verify HTTP Status Codes
    Where are the AES 256bit cipher suites? Please someone help
    outlook 如何预订会议和会议室
    安装Axis2的eclipse插件后,未出现界面
    windows 环境变量
    python 时间日期处理汇集
    openldap学习笔记(使用openldap2.3.32)
    set p4 environment in windows
  • 原文地址:https://www.cnblogs.com/chinxi/p/6134053.html
Copyright © 2011-2022 走看看