zoukankan      html  css  js  c++  java
  • 1st 英文文章词频统计

    英文文章词频统计:

    功能:统计一篇英文文章的单词总数及出现频数并输出,之后排序,输出频数前五的单词及其频数。

    实现方法:使用C语言,用fopen函数读入txt文件,fscanf函数逐个读入单词,结构体wordNode存储单词及其频数,以链表的形式连接在一起,最后使用归并排序按频数从高到低排序,输出频数最高的5个单词。

     头文件

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

     定义宏

    /* Failure status code (returned by wordCount when no node is available). */
    #define ERROR 1
    /* Success status code. */
    #define OK 0
    /* Size in bytes of every word buffer, including the terminating NUL. */
    #define WORD_LENGTH 250

     自定义数据类型

    /* Integer status code type used as the return type of main and wordCount. */
    typedef int status;
    
    /* One entry of the singly linked word-frequency list. */
    typedef struct Node
    {
        char word[WORD_LENGTH];   /* the word itself, NUL-terminated */
        int time;                 /* how many times the word has been counted */
        struct Node *next;        /* following entry in the list */
    }wordNode;

     定义全局变量

    /* Head of the global word-frequency list; NULL while the list is empty. */
    wordNode *headNode = NULL;

     声明所有使用的函数

    /* Forward declarations for the functions defined below. */

    /* Find the node holding word, creating it if absent; *num counts created nodes. */
    wordNode *wordSearch(char *word,int *num);
    /* Increment the occurrence count of word. */
    status wordCount(char *word,int *num);
    /* Print the node total followed by every word with its count. */
    void printCountList(int *num);
    /* Print at most the first five entries of the list. */
    void PrintFirstFiveTimes();
    /* Merge-sort the list in place, highest count first. */
    void mergeSort(wordNode **head);
    /* Split a list into a front half and a back half. */
    void FrontBackSplit(wordNode *head,wordNode **pre,wordNode **next);
    /* Lower-case a token in place and strip characters that are not letters. */
    void wordJob(char word[]);
    /* Merge two lists ordered by descending count into one. */
    wordNode *SortedMerge(wordNode *pre,wordNode *next);
    /* Free every node of the global list. */
    void release();

     主函数

    status main(int argc,char *argv[])
    {
        char temp[WORD_LENGTH];//定义用以临时存放单词的数组
        FILE *file;
        int count;
        int articleWordNum = 0;//定义统计结点个数的变量
        int *num = &articleWordNum;
        if((file = fopen("F:\zc\c\yjs\file.txt", "r")) == NULL)
        {
            printf("文件读取失败!");
            exit(1);
        }
        while((fscanf(file,"%s",temp))!= EOF)
        {
            wordJob(temp);
            count = wordCount(temp,num);
        }
        fclose(file);
        printf("
    输出所有单词的频数
    ");
        printCountList(num);
        printf("
    输出词频最高的5个词
    ");
        mergeSort(&headNode);              //排序
        PrintFirstFiveTimes();
        release();
        return 0;
    }

    查找单词所在结点并返回其地址

    /*
     * Looks up word in the global list and returns its node. When the word is
     * not present a new node with a zero count is created, linked in, and
     * *num is incremented.
     *
     * word: NUL-terminated word to find; only its first WORD_LENGTH - 1
     *       characters are significant.
     * num:  counter of nodes created so far; incremented on insertion.
     *
     * Returns the matching or newly created node, or NULL if memory for a new
     * node could not be allocated.
     */
    wordNode *wordSearch(char *word,int *num)
    {
        wordNode *cur;
        wordNode *node;

        /* Compare only as many characters as a node can store, so an
         * over-long word still matches its own truncated copy. */
        for(cur = headNode; cur != NULL; cur = cur->next)
        {
            if(strncmp(cur->word, word, sizeof cur->word - 1) == 0)
            {
                return cur;
            }
        }

        node = malloc(sizeof *node);
        if(node == NULL)
        {
            return NULL;
        }
        /* Bounded copy: always NUL-terminated, never writes past node->word. */
        snprintf(node->word, sizeof node->word, "%s", word);
        node->time = 0;

        if(headNode == NULL)
        {
            /* First node: terminate the list explicitly. */
            node->next = NULL;
            headNode = node;
        }
        else
        {
            /* Later nodes are linked in directly after the head. */
            node->next = headNode->next;
            headNode->next = node;
        }
        *num += 1;
        return node;
    }

    进行词频统计

    /*
     * Records one occurrence of word: obtains its node through wordSearch()
     * and increments that node's counter.
     *
     * Returns 0 when the occurrence was recorded, ERROR when wordSearch()
     * yields no node.
     */
    status wordCount(char *word,int *num)
    {
        wordNode *hit = wordSearch(word,num);

        if(hit != NULL)
        {
            hit->time++;
            return 0;
        }
        return ERROR;
    }

    输出所有词频

    void printCountList(int *num)
    {
        if(headNode == NULL)
        {
            printf("该文件无内容!");
        }
        else
        {
            wordNode *preNode = headNode;
            printf("
    	总计 %d 
    ",*num);
            while(preNode != NULL)
            {
                printf("
    	%s:%d次
    ",preNode->word,preNode->time);
                preNode = preNode->next;
            }
        }
    }

    输出词频最高的5个词

    void PrintFirstFiveTimes()
    {
        if(headNode == NULL)
        {
            printf("该文件无内容!");
        }
        else
        {
            wordNode *preNode = headNode;
            int i = 1;
            while (preNode != NULL && i<=5)
            {
                printf("
    	%s:%d次
    ",preNode->word,preNode->time);
                preNode = preNode->next;
                i++;
            }
        }
    }

    对词频统计结果进行归并排序

    /*
     * Sorts the list referenced by *headnode using a recursive merge sort:
     * split the list in two, sort each half, then merge the halves.
     * On return *headnode points at the first node of the sorted list.
     */
    void mergeSort(wordNode **headnode)
    {
        wordNode *list = *headnode;
        wordNode *front;
        wordNode *back;

        /* Only lists with at least two nodes need any work. */
        if(list != NULL && list->next != NULL)
        {
            FrontBackSplit(list,&front,&back);
            mergeSort(&front);
            mergeSort(&back);
            *headnode = SortedMerge(front,back);
        }
    }

    将链表从中间拆分为前后两半

    /*
     * Splits the list starting at source into two halves.
     *
     * *pre receives the front half (always source itself) and *next receives
     * the back half. For an odd number of nodes the extra node stays in the
     * front half. A list with fewer than two nodes yields *next == NULL.
     */
    void FrontBackSplit(wordNode *source,wordNode **pre,wordNode **next)
    {
        wordNode *mid;
        wordNode *runner;

        if(source == NULL || source->next == NULL)
        {
            *pre = source;
            *next = NULL;
            return;
        }

        /* runner moves two nodes for every one that mid moves, so mid stops
         * on the last node of the front half. */
        mid = source;
        for(runner = source->next; runner != NULL && runner->next != NULL; runner = runner->next->next)
        {
            mid = mid->next;
        }

        *pre = source;
        *next = mid->next;
        mid->next = NULL;       /* detach the front half from the back half */
    }

    按频数从高到低合并两个已排序的链表

    /*
     * Merges two lists that are each ordered by descending count into one
     * list with the same ordering, relinking the existing nodes.
     *
     * When counts are equal the node from pre is taken first. Either argument
     * may be NULL, in which case the other list is returned unchanged.
     *
     * The merge is iterative, so stack use does not grow with the number of
     * nodes being merged.
     *
     * Returns the first node of the merged list.
     */
    wordNode *SortedMerge(wordNode *pre,wordNode *next)
    {
        wordNode *merged = NULL;
        wordNode **tail = &merged;   /* link where the next node is attached */

        while(pre != NULL && next != NULL)
        {
            if(pre->time >= next->time)
            {
                *tail = pre;
                pre = pre->next;
            }
            else
            {
                *tail = next;
                next = next->next;
            }
            tail = &(*tail)->next;
        }

        /* One list is exhausted; append whatever remains of the other. */
        *tail = (pre != NULL) ? pre : next;
        return merged;
    }

    处理单词

    /*
     * Normalises a token in place: ASCII upper-case letters are converted to
     * lower case and every character that is not an ASCII letter is removed.
     * The result is NUL-terminated and may be empty.
     *
     * word: writable NUL-terminated string; it is never lengthened.
     */
    void wordJob(char word[])
    {
        int src;
        int dst = 0;

        /* Single pass with separate read and write positions, so runs of
         * consecutive non-letters are all dropped. */
        for(src = 0; word[src] != '\0'; src++)
        {
            char c = word[src];

            if(c >= 'A' && c <= 'Z')
            {
                c = (char)(c - 'A' + 'a');
            }
            if(c >= 'a' && c <= 'z')
            {
                word[dst++] = c;
            }
        }
        word[dst] = '\0';
    }

    释放所有结点内存

    /*
     * Frees every node of the global list and leaves headNode set to NULL.
     * Does nothing when the list is already empty.
     */
    void release()
    {
        wordNode *cur = headNode;

        while(cur != NULL)
        {
            wordNode *following = cur->next;   /* read before the node is freed */
            free(cur);
            cur = following;
        }
        headNode = NULL;
    }

    git@git.coding.net:amberpass/Calculate_words.git

    https://git.coding.net/amberpass/Calculate_words.git

    程序运行结果

  • 相关阅读:
    Java生鲜电商平台-服务化后的互联网架构实战(针对生鲜电商小程序或者APP)
    你有一份新的C++书单,请注意查收!
    想了解大数据的鼻祖Hadoop技术栈,这里有一份优质书单推荐!
    程序员的健康问题终于有救了,有个资深程序员写了本《程序员健康指南》!
    《程序员如何优雅地挣零花钱》电子书免费开源!!!
    学习设计模式,你需要这样一份书单!
    《自拍教程36》段位三 Python面向对象类
    《自拍教程35》段位二 Python面向过程函数
    《自拍教程34》段位一 Python批处理脚本
    《自拍教程33》案例篇 内容介绍
  • 原文地址:https://www.cnblogs.com/landscape/p/5845852.html
Copyright © 2011-2022 走看看