zoukankan      html  css  js  c++  java
  • [算法 笔记] 统计给定文章的单词个数

      问题出自blog:http://blog.csdn.net/v_july_v/article/details/6803368

      问题:高效统计一篇英文文章里出现的所有单词,按照在文章中首次出现的顺序打印出该单词和它出现的次数。

      解决方式:利用Trie完成单词匹配;Trie中每个单词的结尾结点指向一个链表结点,链表按单词首次出现的顺序保存各单词及其出现次数。

      源代码:

      

      1 #include <stdio.h>
      2 #include <stdlib.h>     // for calloc(), free()
      3 #include <string.h>     // for strlen(), memset()
      4 
      /* Compile-time sizing constants shared by the list and trie code. */
      enum
      {
          BranchSize = 26,    /* child slots per trie node */
          StringSize = 40,    /* bytes in a list node's word buffer */
          NodeMax    = 200    /* size limit for fixed-size trie node buffers */
      };
      6 
      7 /* 链表信息以及相关操作函数 */
      8 struct ListNode
      9 {
     10     int     m_iCnt;             // 统计单词出现的次数
     11     char    m_szStr[StringSize];// 当前链表项中单词
     12     struct ListNode *m_pNxt;    // 指向下一个链表结点
     13 };
     14 
     15 struct ListHead
     16 {
     17     struct ListNode *m_pStart;  // 指向链表开始结点
     18     struct ListNode *m_pEnd;    // 指向链表末尾结点
     19 };
     20 
     21 typedef struct ListNode ListNode;
     22 typedef struct ListHead ListHead;
     23 
     24 // 分配头结点
     25 ListHead* AllocListHead()
     26 {
     27     ListHead *pNew = NULL;
     28 
     29     pNew = (ListHead *) calloc( 1, sizeof( ListHead ) );
     30     if ( NULL == pNew )
     31     {
     32         printf( "Out of Memory.
    " );
     33         return NULL;
     34     }
     35 
     36     pNew->m_pEnd        = NULL;
     37     pNew->m_pStart      = NULL;
     38 
     39     return pNew;
     40 }
     41 
     42 // 分配链表结点
     43 ListNode* AllocListNode( const char *szStr )
     44 {
     45     ListNode *pNew = NULL;
     46 
     47     pNew = (ListNode *)calloc( 1, sizeof(ListNode) );
     48     if ( NULL == pNew )
     49     {
     50         printf( "Out of Memory.
    " );
     51         return NULL;
     52     }
     53 
     54     // 初始化信息
     55     pNew->m_iCnt = 1;
     56     pNew->m_pNxt = NULL;
     57     strncpy( pNew->m_szStr, szStr, strlen( szStr ) );
     58     pNew->m_szStr[strlen(szStr)] = '';
     59 
     60     return pNew;
     61 }
     62 
     63 // 插入链表
     64 int InsertNodeIntoList( ListHead *pHead, const char *szStr )
     65 {
     66     ListNode *pStart = pHead->m_pStart,
     67               *pNew   = NULL;
     68 
     69     // 检查参数
     70     if ( NULL == szStr )
     71     {
     72         printf( "The string is null.
    " );
     73         return -1;
     74     }
     75 
     76     // 分配新节点
     77     pNew = AllocListNode( szStr );
     78     if ( NULL == pNew )
     79     {
     80         return -1;
     81     }
     82 
     83     // 将结点插入链表尾部
     84     if ( pStart != NULL )
     85     {
     86         pHead->m_pEnd->m_pNxt = pNew;
     87         pHead->m_pEnd = pNew;
     88     }
     89     else
     90     {
     91         pHead->m_pStart = pNew;
     92         pHead->m_pEnd = pNew;
     93     }
     94 
     95     return 1;
     96 }
     97 
     98 // 摧毁链表
     99 void DestoryList( ListHead **pHead )
    100 {
    101     ListNode *pStart = (*pHead)->m_pStart,
    102               *pFree  = NULL;
    103 
    104     if ( NULL == pHead )
    105         return;
    106 
    107     while ( pStart )
    108     {
    109         pFree = pStart;
    110         pStart= pStart->m_pNxt;
    111         free( pFree );
    112         pFree = NULL;
    113     }
    114 
    115     free( *pHead );
    116     *pHead = NULL;
    117 }
    118 
    119 // 输出链表中信息
    120 void OutputList( ListHead *pHead )
    121 {
    122     ListNode *pStart = pHead->m_pStart;
    123     int sum = 0;
    124 
    125     printf( "About statistic:
    ");
    126     while ( pStart )
    127     {
    128         sum += pStart->m_iCnt;
    129         printf( "%s:		%d
    ", pStart->m_szStr, pStart->m_iCnt );
    130         pStart = pStart->m_pNxt;
    131     }
    132     printf( "The total words is %d.
    ", sum );
    133     printf( "
    " );
    134 }
    135 
    136 /* Trie树结构体以及相关操作函数 */
    137 struct TrieNode
    138 {
    139     int     m_iIsStr;       // 记录此处是否构成一个字符串。
    140     struct TrieNode *m_pBranch[BranchSize]; // 指向各个子树的指针,小标0-25代表26个字符
    141     struct ListNode *m_pCountInfo;  // 指向该单词的统计信息结点
    142 };
    143 
    144 typedef struct TrieNode TrieNode;
    145 
    146 // 分配Trie树的新节点
    147 TrieNode* AllocTrieNode()
    148 {
    149     TrieNode *pNew = NULL;
    150     int idx = 0;
    151 
    152     pNew = (TrieNode *) calloc( 1, sizeof( TrieNode ) );
    153     if ( NULL == pNew )
    154     {
    155         printf( "Out of memory.
    " );
    156         return NULL;
    157     }
    158 
    159     // initialize information.
    160     for ( ; idx < BranchSize; ++idx )
    161         pNew->m_pBranch[idx] = NULL;
    162     pNew->m_pCountInfo  = NULL;
    163     pNew->m_iIsStr      = 0;
    164 
    165     return pNew;
    166 }
    167 
    168 // 在Trie树中查找单词
    169 int SearchNodeInTrie( TrieNode *pRoot,
    170                       const char *word )
    171 {
    172     TrieNode *pStart = pRoot;
    173 
    174     while ( *word && pStart )
    175     {
    176         pStart = pStart->m_pBranch[*word - 'a'];
    177         ++word;
    178     }
    179 
    180     // 在Trie树中找到szStr,则更新结点信息。
    181     if ( pStart != NULL && pStart->m_iIsStr )
    182     {
    183         pStart->m_pCountInfo->m_iCnt++;
    184         return 1;
    185     }
    186 
    187     return 0;
    188 }
    189 
    190 // 插入单词到Trie树中
    191 int InsertNodeIntoTrie( TrieNode *pRoot,
    192                         ListHead *pStart,
    193                         const char *szStr )
    194 {
    195     TrieNode *location  = pRoot;
    196     const char *word    = szStr;
    197 
    198     if ( SearchNodeInTrie( pRoot, szStr) == 1 )
    199         return 0;
    200 
    201     while ( *szStr )
    202     {
    203         if ( location->m_pBranch[*szStr - 'a'] == NULL ) // 不存在
    204         {
    205             TrieNode *pNew = AllocTrieNode();
    206             if ( NULL == pNew )
    207                 return -1;
    208             location->m_pBranch[*szStr - 'a'] = pNew;
    209         }
    210         // 每插入一步,相当于一个新串经过,指针要向下移动
    211         location = location->m_pBranch[*szStr - 'a'];
    212         ++szStr;
    213     }
    214     location->m_iIsStr = 1;
    215     if ( InsertNodeIntoList( pStart, word ) == 1 )
    216     {
    217         location->m_pCountInfo = pStart->m_pEnd;
    218         return 1;
    219     }
    220 
    221     return 0;
    222 }
    223 
    224 // 摧毁Trie树
    225 void DestoryTrie( TrieNode **pRoot )
    226 {
    227     TrieNode *TrieStack[NodeMax],
    228              *pNxt  = NULL,
    229              *root  = *pRoot;
    230     int top     = 0,
    231         idx     = 0;
    232 
    233     // Initialize stack
    234     for ( ; idx < NodeMax; ++idx )
    235         TrieStack[idx] = NULL;
    236 
    237     for ( idx = 0; idx < BranchSize; ++idx )
    238     {
    239         if ( root->m_pBranch[idx] != NULL )
    240             TrieStack[top++] = root->m_pBranch[idx];
    241     }
    242 
    243     // 遍历Trie树,并删除
    244     while ( top )
    245     {
    246         pNxt = TrieStack[--top];
    247 
    248         for ( idx = 0; idx < BranchSize; ++idx )
    249         {
    250             if ( pNxt->m_pBranch[idx] != NULL )
    251                 TrieStack[top++] = pNxt->m_pBranch[idx];
    252         }
    253 
    254         free( pNxt );
    255         pNxt = NULL;
    256     }
    257 
    258     free( *pRoot );
    259     *pRoot = NULL;
    260 }
    261 
    262 void TestFunction()
    263 {
    264     const char *pszStrs[9] =
    265         {
    266             "hello", "word", "hi",
    267             "hello", "hello","hi",
    268             "word", "word", "word"
    269         };
    270     int idx = 0;
    271     TrieNode *pTrie = NULL;
    272     ListHead *pList = NULL;
    273 
    274     pTrie = AllocTrieNode();
    275     if ( NULL == pTrie )
    276         return;
    277     pList = AllocListHead();
    278     if ( NULL == pList )
    279     {
    280         DestoryTrie( &pTrie );
    281         return;
    282     }
    283 
    284     for ( idx = 0; idx < 9; ++idx )
    285     {
    286         InsertNodeIntoTrie( pTrie, pList, pszStrs[idx] );
    287     }
    288 
    289     OutputList( pList );
    290 
    291     DestoryTrie( &pTrie );
    292     DestoryList( &pList );
    293 }
    294 
    /* Program entry point: runs the word-count demo and reports success. */
    int main( void )
    {
        TestFunction();
        return 0;
    }
    View Code

      Trie树源码参考blog:http://www.cnblogs.com/cherish_yimi/archive/2009/10/12/1581666.html

  • 相关阅读:
    Educational Codeforces Round 11——A. Co-prime Array(map+vector)
    ACM程序设计选修课——Problem D: (ds:树)合并果子(最优二叉树赫夫曼算法)
    ACM程序设计选修课——1076汇编语言(重定向+模拟)
    NOJ——1672剪绳子(博弈)
    廖雪峰Java8JUnit单元测试-1JUnit简介-1JUnit测试
    廖雪峰Java7处理日期和时间-4最佳实践-最佳实践
    廖雪峰Java7处理日期和时间-3java.time的API-2ZonedDateTime
    廖雪峰Java7处理日期和时间-3java.time的API-1LocalDateTime
    廖雪峰Java7处理日期和时间-2Data和Calendar-2Calendar
    廖雪峰Java7处理日期和时间-2Data和Calendar-1Date
  • 原文地址:https://www.cnblogs.com/life91/p/3272080.html
Copyright © 2011-2022 走看看