zoukankan      html  css  js  c++  java
  • Trie实现(C++)

    参考了本文:http://www.cnblogs.com/xulb597/archive/2012/07/05/2578562.html

    • 支持模糊搜索,比如,【bkmh】可以匹配【BuKaManHua】;
    • 支持优先级,首字母、大小字母有更高的优先级。

    亟需解决的问题:

    • 目前搜索结果与关键词中字母的顺序无关,即【buk】可以匹配【BKManHua】
    • 若条目中存在重复的任意关键字,即使不包含其他关键字,仍然能匹配上来
    • 内存的占用:因为接触C++不久,内存管理一窍不通,一个1.65MB的文件(312964个单词),索引之后程序(VS2013编译,Release版本)的内存有68 860KB,

    trie.hpp

    #ifndef TRIE
    #define TRIE
    
    #include "LinkedList.hpp"
    #include "MatchInfo.hpp"
    
    #include <stdlib.h>
    #include <iostream>
    #include <string>
    
    #define BRANCH_SIZE 28
    #define START_CHAR 'a'
    
    #define INDEX(x) (x == '?'? 27 : (x == '*'? 26 : x - START_CHAR))
            
    class Trie
    {
    public:
        Trie()
        {
            rootNode = new Trie_Node(0);
            memset(nodeMap, NULL, sizeof(nodeMap));
            memset(indexList, NULL, sizeof(indexList));
        }
    
        ~Trie()
        {
            //delete rootNode;
        }
    
        void insert(const char *data, const int i)
        {
            bool flag_start = false,flag_capital = false;
            Trie_Node *location = rootNode;
            int pos = 0;
            while (*data)
            {
                char c = *data;
                // check wether it's capital and convert to lowwer case if so.
                if(c > 'A'-1 && c < 'Z'+1)
                {
                    flag_capital = true;
                    c += 32;
                }else{
                    flag_capital = false;
                }
                // map the char value to int which starts from 0
                int index = INDEX(c);
                // skip invalid chars
                if(index < 0)
                {
                    data++;
                    pos++;
                    continue;
                }
                // find next
                if(location->next[index] == NULL)
                {
                    location->next[index] = getNode(index);
                }
                location = location->next[index];
    
                // build MatchInfo and add it to the trie node's indexList
                MatchInfo *info = new MatchInfo();
                info->itemindex = i;
                info->position = pos;    // position of the char in the string
                info->priority = 1;
    
                // intial or capital char has a higher priority
                if(!flag_start)
                {
                    flag_start = true;
                    info->priority++;
                }
                if(flag_capital)
                    info->priority++;
                if(indexList[index] == NULL)
                    indexList[index] = new LinkedList<MatchInfo>();
                indexList[index]->add(info);
                data++;
                pos++;
            }
            // end character has a higher priority
            //location->indexList->getCurrent()->value->priority++;
        }
    
        /*int match(const char *data)
        {
            Trie_Node *location = rootNode;
            while (*data && location)
            {
                location = location->next[INDEX(*data)];
                data++;
            }
            return (location != NULL);
        }*/
    
        /*void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength)
        {
            predicateIndexMap(data, indexMap, indexMapLength);
            
            int index;
            Trie_Node *location = rootNode;
            while (*data && location)
            {
                index = INDEX(*data);
                location = location->next[INDEX(*data)];
                if(location != NULL)
                {
                    fillIndexArray(indexMap, index);
                }
                data++;
            }
        }*/
    
        void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength)
        {
            predicateIndexMap(data, indexMap, indexMapLength);
            
            int index;
            Trie_Node *location = nodeMap[INDEX(*data)];
            do
            {
                index = INDEX(*data);
                if(location != NULL)
                {
                    fillIndexArray(indexMap, index);
                }
                else
                {
                    break;
                }
                data++;
            } while ((*data) && (location = nodeMap[index]));
        }
    
        /*void print()
        {
            print(rootNode);
        }*/
    
    private:
        //
        // a list to record matche info of each char in indexed words.
        // it's for priority and fuzzy seaching.
        //
        LinkedList<MatchInfo>* indexList[BRANCH_SIZE];
    
        struct Trie_Node
        {
            //int index;
            Trie_Node *next[BRANCH_SIZE];
            
            Trie_Node(int _index)
            {
                //index = _index;
                memset(next, NULL, sizeof(next));
            };
            ~Trie_Node()
            {
                //delete indexList;
                for (int i = 0; i < BRANCH_SIZE; i++)
                {
                    if(next[i])
                        delete next[i];
                }
            }
        };
    
        Trie_Node *rootNode;
    
        //
        // a map to hold all created Trie_Node.
        //
        Trie_Node *nodeMap[BRANCH_SIZE];
    
        //
        // /*get a trie node from map.*/
        // return a new Trie_Node;
        // index: (char - 'a')
        //
        Trie_Node *getNode(int index)
        {
            //return new Trie_Node(index);
            Trie_Node *tempNode = nodeMap[index];
            if(tempNode == NULL)
            {
                tempNode = new Trie_Node(index);
                nodeMap[index] = tempNode;
            }
            return tempNode;
        }
    
        //
        // fill [indexMap] with priority of char at [index]
        //
        void fillIndexArray(int* indexMap, int index)
        {
            if(indexList[index] == NULL)
                indexList[index] = new LinkedList<MatchInfo>();
            LinkedList<MatchInfo> *list = indexList[index];
            Node<MatchInfo> *node = list->getRoot();
            while (node)
            {
                int itemIndex = node->value->itemindex;
                if(indexMap[itemIndex] != -1)
                    indexMap[itemIndex] += node->value->priority;
                node = node->next;
            }
        }
    
        //
        // keep moving node to next until it's itemindex in value has been changed.
        // node will set to NULL if reaches the end.
        //
        void moveToNextItemIndex(Node<MatchInfo> **node)
        {
            int index = (*node)->value->itemindex;
            if((*node)->next == NULL)
                (*node) = NULL;
            else
            {
                while ((*node)->value->itemindex == index)
                {
                    (*node)=(*node)->next;
                    if((*node) == NULL)
                        break;
                }
            }
        }
    
        //
        // predicate whether an index in indexMap is impossiable to be matched.
        // It will be set to -1 if so.
        //
        void predicateIndexMap(const char* keyword, int* indexMap, size_t indexMapLength)
        {
            int *indexesMatched = new int[indexMapLength];
            int keywordLength = strlen(keyword);
            unsigned int keywordRecords[BRANCH_SIZE];
            size_t size = indexMapLength * sizeof(int);
            memset(indexesMatched, 0, size);
            memset(indexMap, -1, size);
            LinkedList<MatchInfo> *list;
            Node<MatchInfo> *match_node;
            int charIndex, index = 0;
            while (*keyword)
            {
                charIndex = INDEX(*keyword);
                if(keywordRecords[charIndex] == 1)
                {
                    keyword++;
                    continue;
                }
                keywordRecords[charIndex] = 1;
                list = indexList[charIndex];
                if(list != NULL)
                {
                    match_node = list->getRoot();
                    while (match_node != NULL)
                    {
                        indexesMatched[match_node->value->itemindex]++;
                        match_node = match_node->next;
                        //moveToNextItemIndex(&match_node);
                    }
                }
                keyword++;
            }
            for (int i = 0; i < indexMapLength; i++)
            {
                if(indexesMatched[i] >= keywordLength)
                    indexMap[i] = 0;
            }
            delete indexesMatched;
        }
    
        /*void print(Trie_Node* node)
        {
            char c;
            for (int i = 0; i < BRANCH_SIZE; i++)
            {
                if(node->next[i] != NULL)
    {
                    c = node->index + 'a';
                    printf("%c-", c);
                    print(node->next[i]);
                }
            }
        }*/
    };
    #endif // TRIE
    View Code

    LinkedList.hpp

    #ifndef LINKEDLIST
    #define LINKEDLIST
    
    #include <stdlib.h>
    #include <iostream>
    
    template <class T>
    struct Node
    {
        T* value;
        int index;
        Node *next;
        Node(T* _value, int _index)
        {
            value = _value;
            index = _index;
        }
        ~Node()
        {
            delete value;
        }
    };
    
    template <class T>
    class LinkedList
    {
    public:
        int length;
        LinkedList()
        {
            length = 0;
            root = new Node<T>(NULL, 0);
            current = root;
        };
        ~LinkedList()
        {
            Node<MatchInfo> *node = root;
            Node<MatchInfo> *tmp;
            while (node)
            {
                tmp = node->next;
                delete node;
                node = tmp;
            }
        };
        void add(T *value)
        {
            if(length == 0)
            {
                root->value = value;
                root->index = 0;
            }
            else
            {
                current->next = new Node<T>(value, current->index + 1);
                current = current->next;
            }
            length++;
            current->next = NULL;
        };
        Node<T> getAt(int index)
        {
            Node<T> *node = root;
            while (node)
            {
                if(node->index == index)
                    return node;
                node = node->next;
            }
            return NULL;
        }
        Node<T> *getRoot()
        {
            return root;
        }
    
        Node<T> *getCurrent()
        {
            return current;
        }
    private:
        Node<T> *root,*current;
    };
    
    
    #endif // LINKEDLIST
    View Code

    MatchInfo.hpp

    #ifndef DEFINE_MatchInfo
    #define DEFINE_MatchInfo
    //
    // 字符的匹配信息
    //
    struct MatchInfo
    {
        // 所在条目的序号
        int itemindex;
        // 所在的位置
        int position;
        // 优先级
        int priority;
    };
    #endif
    View Code

    SortBiTree.hpp

    #ifndef DEFINE_SortBiTree
    #define DEFINE_SortBiTree
    
    #include <stdlib.h>
    #include <iostream>
    #include <string>
    
    template <class T>
    struct BTNode
    {
        int index;
        T value;
        BTNode<T> *left,*right;
        BTNode(int _i, T _v)
        {
            index = _i;
            value = _v;
            left = NULL;
            right = NULL;
        }
    };
    
    template <class T>
    class SortBiTree
    {
    public:
        SortBiTree()
        {
            root = NULL;
        }
        ~SortBiTree()
        {
        }
    
        void add(int index, T value)
        {
            BTNode<T> *node = root;
            if(root == NULL)
                root = new BTNode<T>(index, value);
            else
            {
                add_iter(root, index, value);
            }
        }
    
        BTNode<T> *getMaxNode()
        {
            return maxNode;
        }
    
        BTNode<T> *getRootNode()
        {
            return root;
        }
    private:
        BTNode<T> *root,*minNode,*maxNode;
        void add_iter(BTNode<T>* node, int index, int value)
        {
            if(index > node->index)
            {
                if(node->left != NULL)
                {
                    add_iter(node->left, index, value);
                }
                else
                {
                    node->left = new BTNode<T>(index, value);
                    maxNode = node->left;
                }
            }
            else
            {
                if(node->right != NULL)
                {
                    add_iter(node->right, index, value);
                }
                else
                {
                    node->right = new BTNode<T>(index, value);
                    minNode = node->right;
                }
            }
        }
    };
    #endif
    View Code

    Stopwatch.hpp

    #ifndef STOPWATCH_DEF
    #define STOPWATCH_DEF
    #include <ctime>
    
    class Stopwatch
    {
    public:
        Stopwatch(){};
        ~Stopwatch(){};
        static Stopwatch* StartNew()
        {
            Stopwatch* sw = new Stopwatch();
            sw->startTime = clock();
            return sw;
        }
    
        double Stop()
        {
            double t = (double)(clock() - startTime)/CLOCKS_PER_SEC;
            delete this;
            return t;
        }
    private:
        clock_t startTime;
    };
    #endif
    View Code

    main.cpp

    #include <stdlib.h>
    #include <iostream>
    #include <string>
    #include <fstream>
    #include <sys/stat.h>
    
    #include "Stopwatch.hpp"
    #include "trie.hpp"
    #include "SortBiTree.hpp"
    
    using namespace std;
    
    int displayMaxItems = 10;
    
    unsigned long get_file_size(const char *path)
    {
        unsigned long filesize = -1;    
        struct stat statbuff;
        if(stat(path, &statbuff) < 0){
            return filesize;
        }else{
            filesize = statbuff.st_size;
        }
        return filesize;
    }
    
    void readFromFile(const char* path, char** buff)
    {
        long length = get_file_size(path);    // 取得文件大小
        if(length == -1){
            cerr << "content file is invalid!" << endl;
            system("quit");
            return;
        }
        FILE *f = fopen(path, "r");
        *buff = new char[length];
        fread(*buff, sizeof(char), length, f);
        fclose(f);
    }
    
    void readInlineChars(char* source, char** buff)
    {
        *buff = new char[strlen(source)];
        strcpy(*buff, source);
    }
    
    //
    // 用于显示结果
    // TODO:使用迭代器
    //
    void inorder_traverse(BTNode<int> *node, char* words[], int* indexMap, int threshold) {
        if(displayMaxItems == 0)
            return;
        if (NULL != node->left) {
            inorder_traverse(node->left, words, indexMap, threshold);
        }
        int confidence = indexMap[node->value];
        if(confidence > threshold)
        {
            printf("%i	%i	%s
    ", node->value, confidence, words[node->value]);
            displayMaxItems--;
        }
        if (NULL != node->right) {
            inorder_traverse(node->right, words, indexMap, threshold);
        }
    }
    
    void main(char* argv){
        Trie t;
        SortBiTree<int> bt;
        char* cpy = new char[];
        //=====================================
        // 注意,大文件请自行修改,去掉words
        //=====================================
        char *words[256];
    
        printf("indexing...");
        //readFromFile("contents.txt", &cpy);
        readFromFile("app_list.txt", &cpy);
        //readInlineChars("PlayShangDian PhotoshopDesigner pho Pho BuKaManHua BKManHua", &cpy);
        //readInlineChars("abc acc caa acb aaa abb", &cpy);
        //readInlineChars("aac aaa", &cpy);
    
        char *tk = strtok(cpy, " ");
        
        size_t index = 0;
        Stopwatch *sw = Stopwatch::StartNew();
        t.insert(tk, index++);
        while (tk = strtok(NULL, " "))
        {
            words[index] = tk;
            t.insert(tk, index++);
        }
    
        printf("%i word(s) have been indexed. [%lf seconds]
    ", index, sw->Stop());
    label_enter:
        printf("I'm searching for:
    >");
    
        string input;
        const char *chars;
        while (true)
        {
            cin >> input;
            chars = input.data();
            if(!isalpha(*chars) || (*chars <= 'Z' && *chars >= 'A'))
            {
                printf("only lowwer character is accepted!
    >");
            }
            else
                break;
        }
        
        int *indexMap = new int[index];
        sw = Stopwatch::StartNew();
        printf("searching...");
        t.fuzzy_match(chars, indexMap, index);
        printf("done. [%lf seconds]
    ", sw->Stop());
    label_change:
        bt = SortBiTree<int>();
        printf("please input the threshold:
    >");
        int threshold = 0;
        cin >> threshold;
    
        printf("calculating...");
        int count = 0;
        for (int i = 0; i < index; i++)
        {
            int confidence = indexMap[i];
            if(confidence > threshold)
            {
                count++;
                bt.add(confidence, i);
            }
        }
        printf(" [%i] words matched.
    ", count);
        displayMaxItems = 10;
        if(count > displayMaxItems)
            printf("first 10 items are listed below.
    ");
        goto label_display;
    
    label_display:
        printf("------------------------------------------
    ");
        printf("index	priority	content
    ");
        printf("------------------------------------------
    ");
        if(bt.getRootNode() != NULL)
            inorder_traverse(bt.getRootNode(), words, indexMap, threshold);
        else
            printf("                none                      
    ");
        /*it = &bt.getIterator();
        while (it->hasNext())
        {
            int i = it->next();
            int confidence = indexMap[i];
            if(confidence > threshold)
            {
                printf("%s	%i	%i
    ",words[i], confidence, i);
            }
        }*/
        printf("------------------------------------------
    ");
    label_menu:
        printf("now you may want to : 
    [1].See them all.
    [2].Change thresgold
    [3].Change keyword.
    [q].Exit
    >");
        char choise = 0;
        cin >> choise;
        switch (choise)
        {
        case '1':
            displayMaxItems = -1;
            goto label_display;
            break;
        case '2':
            goto label_change;
            break;
        case '3':
            goto label_enter;
            break;
        case 'q':
            break;
        default:
            break;
        }
    
        //delete indexMap;
    }
    View Code

    截图:

  • 相关阅读:
    Blazor client-side + webapi (.net core 3.1) 添加jwt验证流程(非host)第一步
    移动端web框架候选
    Blazor初体验之寻找存储client-side jwt token的方法
    python学习记录(持续更新)--最最最基础的一部分(方法,异常处理,注释,类)
    python学习记录(持续更新)--最最最基础的一部分(条件分支、循环、数组矩阵、元组)
    python学习记录(持续更新)--最最最基础的一部分(变量、字符串、运算符、常见函数)
    C#后台异步消息队列实现
    Quartz.Net 删除一个Job
    授权认证(IdentityServer4)
    文件同步
  • 原文地址:https://www.cnblogs.com/ornithopter/p/3732496.html
Copyright © 2011-2022 走看看