zoukankan      html  css  js  c++  java
  • 基于COCA词频表的文本词汇分布测试工具v0.2

    update:

    • 简单整理了一下代码的组织。
    • 处理的单词封装成类,单词的修正,信息的显示都作为其内的方法。

    写得还比较糙,工具本身可以封装,还有对于单词的变形基本没什么处理,以后有时间再改。

    项目托管到github上了。https://github.com/MorpheusDong/TextVocabularyAnalyzer

    TypeDefine.h

    #ifndef _TYPE_DEFINE_H_
    #define _TYPE_DEFINE_H_
    
    #include <iostream>
    #include <fstream>
    #include <string>
    #include <array>
    #include <vector>
    #include <iterator>
    #include <map>
    
    using namespace std;
    
    // Number of "frequency word" records main() tries to read from the COCA list file.
    #define COCA_WORDS_NUM                       20201U
    // Number of first-letter buckets (one per letter of alphabet_str).
    #define WORDS_HEAD_NUM                       26U
                                                 
    // Bucket index of each initial letter; the values match the letter's
    // position in alphabet_str ('a' -> 0 ... 'z' -> 25).
    #define WORDS_HEAD_A                         0U
    #define WORDS_HEAD_B                         1U
    #define WORDS_HEAD_C                         2U
    #define WORDS_HEAD_D                         3U
    #define WORDS_HEAD_E                         4U
    #define WORDS_HEAD_F                         5U
    #define WORDS_HEAD_G                         6U
    #define WORDS_HEAD_H                         7U
    #define WORDS_HEAD_I                         8U
    #define WORDS_HEAD_J                         9U
    #define WORDS_HEAD_K                         10U
    #define WORDS_HEAD_L                         11U
    #define WORDS_HEAD_M                         12U
    #define WORDS_HEAD_N                         13U
    #define WORDS_HEAD_O                         14U
    #define WORDS_HEAD_P                         15U
    #define WORDS_HEAD_Q                         16U
    #define WORDS_HEAD_R                         17U
    #define WORDS_HEAD_S                         18U
    #define WORDS_HEAD_T                         19U
    #define WORDS_HEAD_U                         20U
    #define WORDS_HEAD_V                         21U
    #define WORDS_HEAD_W                         22U
    #define WORDS_HEAD_X                         23U
    #define WORDS_HEAD_Y                         24U
    #define WORDS_HEAD_Z                         25U
                                                 
    // Number of entries in usual_w_out_of_COCA_str.
    #define USUAL_WORD_NUM                       17U
    
    
    // Frequency bands used to bucket words by their COCA rank.
    // The enumerators double as indices into report_str and into the
    // per-band counter array, so their values must stay contiguous from 0.
    enum WordFrequencyType
    {
        WORD_UNDER_4000     = 0,   // rank 1..4000
        WORD_4000_6000      = 1,
        WORD_6000_8000      = 2,
        WORD_8000_10000     = 3,
        WORD_10000_12000    = 4,
        WORD_12000_14000    = 5,
        WORD_14000_16000    = 6,
        WORD_OVER_16000     = 7,
        WORD_NOT_FOUND_COCA = 8,   // word has no entry in the COCA list
        WORD_LEVEL_NUM      = 9    // number of bands (array size), not a band itself
    };

    // Alias kept for the code that refers to the enum by its "Tag" name.
    using TagWordFrequencyType = WordFrequencyType;
    
    // Lower-case alphabet; the position of a letter in this string is the
    // index of its first-letter bucket (see the WORDS_HEAD_* constants).
    const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";
    
    // Report labels, indexed by TagWordFrequencyType.
    // There are WORD_LEVEL_NUM slots but only one label per real band
    // (WORD_UNDER_4000 .. WORD_NOT_FOUND_COCA).
    const string report_str[WORD_LEVEL_NUM] = {
        "UNDER 4000: ",
        "4000-6000: ",
        "6000-8000: ",
        "8000-10000: ",
        "10000-12000: ",
        "12000-14000: ",
        "14000-16000: ",
        "16000-20000+: ",
        // The leading "\n" visually separates the "not found" line from the
        // frequency bands above it. It must be written as an escape: a raw
        // line break inside a string literal does not compile.
        "\nNot found in COCA:"
    };
    
    // Common words and fragments that are not included in the COCA list.
    // CLetters::usual_recheck() compares a word against this table; main()
    // then counts a match in the WORD_UNDER_4000 band.
    // Entries such as "s", "t" and "re" look like the pieces left after
    // splitting contractions at the apostrophe ("it's", "won't", "we're")
    // -- presumably that is why they are listed.
    const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] =
    {
        "s","is","are","re","was","were",
        "an","won","t","has","had","been",
        "did","does","cannot","got","men"
    };
    
    
    #endif

    TextVocabularyAnalyzer.h

    #ifndef _TEXT_VOCABULARY_ANALYZER_H_
    #define _TEXT_VOCABULARY_ANALYZER_H_
    
    #include "TypeDefine.h"
    
    // Maps a COCA rank to its frequency band (0 means "not found in COCA").
    extern TagWordFrequencyType frequency_classify(const int wfrq);
    // Increments the counter of the given band in wfrq_array.
    extern void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);
    // True when c is an ASCII letter (a-z or A-Z).
    extern bool isaletter(const char& c);
    
    // Holds one word taken from the analysed text (stored in lower case)
    // together with the operations used while looking it up.
    class CLetters
    {
    public:
        CLetters();
        ~CLetters();

        // Replaces the stored word with the characters of vw, lower-cased.
        void fill(vector<char>& vw);

        // Returns a copy of the stored word.
        const string word();

        // Returns the first character of the stored word.
        const char firstletter();

        // Prints a progress message for the stored word.
        void processing();

        // True when the stored word is listed in usual_w_out_of_COCA_str.
        bool usual_recheck();

        // Strips a trailing "s", "ed" or "ing" from the stored word;
        // returns true when something was stripped.
        bool form_recheck();

    private:
        string m_word;
    };
    
    
    
    #endif // !_TEXT_VOCABULARY_ANALYZER_H_

    TextVocabularyAnalyzer.cpp

    /* TextVocabularyAnalyzer.cpp */
    
    #include <algorithm>
    #include "TextVocabularyAnalyzer.h"
    
    // Maps a COCA rank to its frequency band.
    //
    // wfrq: rank of the word in the COCA list (1 = most frequent).
    //       0 is the "no entry" value; negative values are not valid ranks.
    // Returns WORD_NOT_FOUND_COCA for any rank <= 0, otherwise the band whose
    // range contains the rank. Upper bounds are inclusive (4000 is still
    // WORD_UNDER_4000) and everything above 16000 is WORD_OVER_16000.
    TagWordFrequencyType frequency_classify(const int wfrq)
    {
        // Previously only 0 was treated as "not found", so a negative value
        // fell through every range test and was reported as a rare word.
        if (wfrq <= 0)
        {
            return WORD_NOT_FOUND_COCA;
        }

        // The checks run in ascending order, so each one only needs the
        // upper bound of its band.
        if (wfrq <= 4000)  { return WORD_UNDER_4000; }
        if (wfrq <= 6000)  { return WORD_4000_6000; }
        if (wfrq <= 8000)  { return WORD_6000_8000; }
        if (wfrq <= 10000) { return WORD_8000_10000; }
        if (wfrq <= 12000) { return WORD_10000_12000; }
        if (wfrq <= 14000) { return WORD_12000_14000; }
        if (wfrq <= 16000) { return WORD_14000_16000; }
        return WORD_OVER_16000;
    }
    
    void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
    {
        switch (wfrq_tag)
        {
        case WORD_UNDER_4000:
        {
            wfrq_array[WORD_UNDER_4000] += 1;
            break;
        }
        case WORD_4000_6000:
        {
            wfrq_array[WORD_4000_6000] += 1;
            break;
        }
        case WORD_6000_8000:
        {
            wfrq_array[WORD_6000_8000] += 1;
            break;
        }
        case WORD_8000_10000:
        {
            wfrq_array[WORD_8000_10000] += 1;
            break;
        }
        case WORD_10000_12000:
        {
            wfrq_array[WORD_10000_12000] += 1;
            break;
        }
        case WORD_12000_14000:
        {
            wfrq_array[WORD_12000_14000] += 1;
            break;
        }
        case WORD_14000_16000:
        {
            wfrq_array[WORD_14000_16000] += 1;
            break;
        }
        case WORD_OVER_16000:
        {
            wfrq_array[WORD_OVER_16000] += 1;
            break;
        }
        default:
        {
            wfrq_array[WORD_NOT_FOUND_COCA] += 1;
            break;
        }
        }
    }
    
    // Returns true when c is an ASCII letter, i.e. in 'a'..'z' or 'A'..'Z'.
    bool isaletter(const char& c)
    {
        const bool is_lower = ('a' <= c) && (c <= 'z');
        const bool is_upper = ('A' <= c) && (c <= 'Z');
        return is_lower || is_upper;
    }
    
    
    // CLetters implementation

    // Creates an object holding an empty word.
    CLetters::CLetters()
        : m_word()
    {
    }
    
    // Nothing to release by hand: m_word cleans up after itself.
    CLetters::~CLetters() = default;
    
    // Replaces the stored word with the characters of vw, converted to
    // lower case. vw itself is not modified.
    void CLetters::fill(vector<char>& vw)
    {
        m_word.assign(vw.begin(), vw.end());

        // tolower is wrapped in a lambda for two reasons:
        //  - with "using namespace std", a bare `tolower` names an overload
        //    set (the <cctype> function and the <locale> template), which
        //    cannot be deduced as transform's functor on every compiler;
        //  - tolower(int) is only defined for values representable as
        //    unsigned char (or EOF), so a plain char must be converted first.
        transform(m_word.begin(), m_word.end(), m_word.begin(),
            [](unsigned char ch) { return static_cast<char>(tolower(ch)); });
    }
    
    // Returns a copy of the stored (lower-case) word.
    const string CLetters::word()
    {
        return m_word;
    }
    
    // Returns the first character of the stored word.
    // For an empty word, string::operator[] at index size() yields '\0'.
    const char CLetters::firstletter()
    {
        return m_word[0];
    }
    
    void CLetters::processing()
    {
        cout << "Finding word "" << m_word << ""...	";
    }
    
    
    // Returns true when the stored word is one of the common words listed
    // in usual_w_out_of_COCA_str, false otherwise.
    bool CLetters::usual_recheck()
    {
        for (const string& usual_word : usual_w_out_of_COCA_str)
        {
            if (usual_word == m_word)
            {
                return true;
            }
        }
        return false;
    }
    
    // Tries to reduce an inflected word to a base form by cutting one suffix.
    // Only words longer than three characters are touched. The suffixes are
    // tested in this order and the first match wins:
    //   "s"   -> drop 1 character
    //   "ed"  -> drop 2 characters
    //   "ing" -> drop 3 characters
    // Returns true when the stored word was shortened, false when it was
    // left unchanged.
    bool CLetters::form_recheck()
    {
        const string::size_type len = m_word.size();
        if (len <= 3)
        {
            return false;
        }

        string::size_type cut = 0;    //number of trailing characters to drop
        if (m_word[len - 1] == 's')
        {
            cut = 1;
        }
        else if (m_word.compare(len - 2, 2, "ed") == 0)
        {
            cut = 2;
        }
        else if (m_word.compare(len - 3, 3, "ing") == 0)
        {
            cut = 3;
        }

        if (cut == 0)
        {
            return false;
        }

        m_word.resize(len - cut);
        return true;
    }

    main.cpp

    /* main .cpp */
    
    #include <numeric>
    #include <iomanip>
    #include <ctime>
    #include "TextVocabularyAnalyzer.h"
    
    int main()
    {
        //file init
        ifstream COCA_txt("D:\COCA.txt");
        ifstream USER_txt("D:\JobsSpeech.txt");
    
        //time init
        clock_t startTime, endTime;
        double build_map_time = 0;
        double process_time = 0;
    
        startTime = clock();    //build time start
    
        //build COCA words map
        map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
        int readlines = 0;
    
        while (readlines < COCA_WORDS_NUM)
        {
            int frequency = 0; string word = "";
            COCA_txt >> frequency;
            COCA_txt >> word;
    
            //transform to lower uniformly 
            transform(word.begin(), word.end(), word.begin(), tolower);
    
            //import every word
            for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
            {
                //check word head 
                if (word[0] == alphabet_str[whead])
                {
                    //if a word already exists, only load its lower frequency
                    if (COCA_WordsList[whead].find(word) == COCA_WordsList[whead].end())
                    {
                        COCA_WordsList[whead].insert(make_pair(word, frequency));
                    }
                    else
                    {
                        COCA_WordsList[whead][word] = frequency < COCA_WordsList[whead][word] ? frequency : COCA_WordsList[whead][word];
                    }
                }
                else
                {
                    // do nothing
                }
            }
            readlines++;
        }
    
        endTime = clock();    //build time stop
        build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;
    
        //user prompt
        cout << "COCA words list imported.
    Press any key to start frequency analysis...
    ";
        cin.get();
    
        startTime = clock();    //process time start
    
        //find text words
        vector<char> content_read;
        CLetters word_readed;
        vector<int> frequecy_processed = { 0 };
        array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };
        char char_read = ' ';
    
        //get text char one by one
        while (USER_txt.get(char_read))
        {
            //only letters and '-' between letters will be received
            if (isaletter(char_read) || char_read == '-')
            {
                content_read.push_back(char_read);
            }
            else
            {
                //char which is not a letter marks the end of a word
                if (!content_read.empty())    //skip single letter 
                {
                    int current_word_frequency = 0;
    
                    //assign letters to make the word
                    word_readed.fill(content_read);
                    word_readed.processing();
    
                    cout << "Frequency:";
                    //check the word's head and find its frequency in COCA list
                    for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
                    {
                        if (word_readed.firstletter() == alphabet_str[whead])
                        {
                            cout << COCA_WordsList[whead][word_readed.word()];
                            current_word_frequency = COCA_WordsList[whead][word_readed.word()];
    
                            //check if the word has been processed
                            if (current_word_frequency == 0)
                            {
                                //addtional check
                                if (word_readed.usual_recheck())
                                {
                                    word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
                                }
                                else if (word_readed.form_recheck())
                                {
                                    current_word_frequency = COCA_WordsList[whead][word_readed.word()];    //try again
                                    if (current_word_frequency > 0)
                                    {
                                        frequecy_processed.push_back(current_word_frequency);
                                        word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                                    }
                                    else
                                    {
                                        // do nothing
                                    }
                                }
                                else
                                {
                                    word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
                                }
                            }
                            else if (find(frequecy_processed.begin(), frequecy_processed.end(), current_word_frequency)
                                == frequecy_processed.end())
                            {
                                //classify this word and make statistics
                                frequecy_processed.push_back(current_word_frequency);
                                word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                            }
                            else
                            {
                                // do nothing
                            }
                        }
                        else
                        {
                            //do nothing
                        }
                    }
                    cout << endl;
    
                    content_read.clear();
                }
                else
                {
                    //do nothing
                }
            }
        }
    
        endTime = clock();    //process time stop
        process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;
    
        //calc whole words processed
        int whole_words_analyzed = 0;
        whole_words_analyzed = accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);
    
        //report result
        cout << "
    ////////// Report ////////// 
    ";
        for (int i = 0;i< words_analysis_array.size();i++)
        {
            cout << report_str[i] <<"	"<< words_analysis_array[i] << " (";
            cout<<fixed<<setprecision(2)<<(float)words_analysis_array[i] * 100 / whole_words_analyzed << "%)" << endl;
        }
        cout << "
    Words totally analyzed: " << whole_words_analyzed << endl;
    
        //show run time
        cout << "Map build time: " << build_map_time*1000 << "ms.
    ";
        cout << "Process time: " << process_time*1000 << "ms.
    ";
        cout << "////////////////////////////" << endl;
    
        //close file
        COCA_txt.close();
        USER_txt.close();
    
        return 0;
    }
  • 相关阅读:
    php 数据类型
    Django REST framework基础:视图和路由
    Django REST framework基础:序列化
    android 适配器 ArrayAdapter,SimpleAdapter的学习
    关于系统模块设计的一点疑问?
    看了看 #ifndef 和#pragma once 的区别
    ace.js 中文手册
    .net core 3.1发布时视图Views文件夹不被打打包成.dll文件解决办法
    Asp.net core应用在 Kubernetes上内存使用率过高问题分析
    .Net Core内存回收模式及性能测试对比
  • 原文地址:https://www.cnblogs.com/banmei-brandy/p/13235125.html
Copyright © 2011-2022 走看看