zoukankan      html  css  js  c++  java
  • TF-IDF算法确定阅读主题词解答英语阅读Title题目

         对文章best title的选项进行打分

    #include <windows.h>
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>
    #include <iostream>
    
    using namespace std;
    #define N 5269        // number of reference documents in the corpus
    #define textN 10        // number of question texts to process
    #define ERROR 1
    #define OK 0
    const int WORD_LENGTH = 30;// size of a single-word buffer
    char temp[WORD_LENGTH];// scratch array that temporarily holds one word
    
    // One vocabulary entry: a word together with the counters used to weight it.
    typedef struct Node {
        char word[WORD_LENGTH] = { '\0' };  // NUL-terminated word text (empty by default)
        int time = 0;                        // occurrence count of the word in the article
        int textnum = 0;                     // corpus counter, incremented by fileCount
        double weight = 0;                   // weight computed by calWeight
    }wordNode, wordLink;
    char Libword[900][WORD_LENGTH] = { 0 };            // stop-word table with room for 900 entries
    int wordleng = 0;    // actual number of entries in the stop-word table
    wordNode sumWord[1000];// vocabulary table of the article
    int wordNum = 0;// number of distinct words in the article
    int sumWordNum = 0;// total number of words in the article
    double score[4] = { 0 };// score of each answer option
    int DoLibStop(char *name, char memory[][WORD_LENGTH])
    {
        FILE *cp = fopen(name, "r");//词库位置
        char ch;
        while (!feof(cp))                         //读取词库
        {
            ch = fgetc(cp);
            for (int i = 0; ch != 13 && i<22 && ch != 10; i++)//回车区分词
            {
                Libword[wordleng][i] = ch;
                ch = fgetc(cp);
            }
            //     std::cout<<(word[wordleng]);         //屏幕输出。临时
            wordleng++;
        }
        fclose(cp);    //关闭停用词库
        return wordleng;
    }
    // Removes punctuation and digit characters from word, in place.
    //
    // Matching is byte-wise: every byte of word that equals any byte of the
    // special-character set is dropped, and the remaining bytes are compacted
    // to the front of the buffer in a single pass.
    void wordDelSpe(char word[])
    {
        static const char specialChar[] = ",.;:'“”?!><+=|*&^%$#@\"[](){}0123456789";
        char *out = word;
        for (const char *in = word; *in != '\0'; ++in)
        {
            if (strchr(specialChar, *in) == NULL)
                *out++ = *in;
        }
        *out = '\0';
    }
    // Lowercases word in place, then reports whether it equals one of the
    // first wordleng entries of the stop-word table Libword.
    bool wordCmpStop(char *word)
    {
        // Convert ASCII upper-case letters to lower case.
        for (char *p = word; *p != '\0'; ++p)
        {
            if (*p >= 'A' && *p <= 'Z')
                *p += 32;
        }
        const int entryCount = wordleng;
        int idx = 0;
        while (idx < entryCount)
        {
            if (strcmp(word, Libword[idx]) == 0)
                return true;
            ++idx;
        }
        return false;
    }
    // Records one occurrence of word in the vocabulary table sumWord.
    //
    // word    : NUL-terminated token to record.
    // wordnum : number of entries currently used in sumWord; incremented when
    //           word is new and a free slot exists.
    // The global total sumWordNum is incremented on every call. A new word that
    // does not fit in the table is dropped instead of being written out of bounds.
    void wordSearch(char *word, int &wordnum) {
        const int capacity = sizeof(sumWord) / sizeof(sumWord[0]);
        const int used = wordnum < capacity ? wordnum : capacity;
        int i = 0;
        while (i < used && (strcmp(sumWord[i].word, word) != 0))
        {
            i++;
        }
        if (i < used)
        {
            sumWord[i].time++;          // already known: bump its count
        }
        else if (wordnum < capacity)
        {
            strcpy(sumWord[wordnum].word, word);   // first occurrence: append
            sumWord[wordnum].time = 1;
            wordnum++;
        }
        sumWordNum += 1;
    }
    //void wordSearch(char *word, int &wordnum) {
    //    int i = 0;
    //    while (i < wordnum && (strcmp(sumWord[i].word, word) != 0) && (!strstr(sumWord[i].word, word) || !strstr(word, sumWord[i].word)))
    //    {
    //        i++;
    //    }
    //    if (i < wordnum)
    //    {
    //        if (!strcmp(sumWord[i].word, word) || strstr(sumWord[i].word, word))
    //            sumWord[i].time++;
    //        else
    //        {
    //            strcpy(sumWord[i].word, word);
    //            sumWord[i].time++;
    //        }
    //    }
    //
    //    if (i == wordnum)
    //    {
    //        strcpy(sumWord[i].word, word);
    //        wordnum++;
    //        sumWord[i].time = 1;
    //    }
    //    sumWordNum += 1;
    //}
    // Builds the vocabulary of the article stored in file0.
    //
    // Tokens are read until end of file or until a token starting with '*'
    // (start of the question part). Each token has special characters removed;
    // stop words only increase the total count sumWordNum, all other tokens are
    // recorded through wordSearch. Exits the program if the file cannot be opened.
    void doArticle(char *file0)
    {
        FILE *file;
        if ((file = fopen(file0, "r")) == NULL) {
            printf("%s文件读取失败!", file0);
            system("pause");
            exit(1);
        }
        // Width 29 keeps every token inside temp (WORD_LENGTH == 30 bytes);
        // longer tokens are read in several pieces instead of overflowing.
        while ((fscanf(file, "%29s", temp)) != EOF)
        {
            if (temp[0] == '*')// reached the question part
                break;
            wordDelSpe(temp);
            if (wordCmpStop(temp) == true)
            {
                sumWordNum += 1;
                continue;
            }
            wordSearch(temp, wordNum);
        }
        fclose(file);
    }
    // Copies every field of node2 into node1 (node1 is the destination).
    void copyNode(wordNode& node1, wordNode &node2)
    {
        node1.weight = node2.weight;
        node1.textnum = node2.textnum;
        node1.time = node2.time;
        strcpy(node1.word, node2.word);
    }
    void sortWord()//直接插入排序
    {
        wordNode t;
        int i, j;
        /*cout << wordNum << endl;*/
        for (i = 1; i < wordNum; i++)
        {
            copyNode(t, sumWord[i]);
            for (j = i - 1; j >= 0 && sumWord[j].weight<t.weight; j--)
            {
                copyNode(sumWord[j + 1], sumWord[j]);
            }
            copyNode(sumWord[j + 1], t);
        }
    }
    void fileCount(char file[N][50])
    {
        int i, j;
        FILE *f;
        for (i = 0; i <N; i++)
        {
            f = fopen(file[i], "r");
            if (!f)
            {
                printf("%s文件读取失败!", file[i]);
                /*system("pause");
                exit(1);*/
                continue;
            }
            while ((fscanf(f, "%s", temp)) != EOF)
            {
                wordDelSpe(temp);
                j = 0;
                while (j < wordNum && (strcmp(sumWord[j].word, temp) != 0))
                {
                    j++;
                }
                if (j < wordNum)
                    sumWord[j].textnum++;//文章数++
            }
            fclose(f);//关闭文件
        }
    }
    // Computes the weight of the first wordNum entries of sumWord as
    // (time / sumWordNum) * log(N / (textnum + 1)).
    void calWeight(wordNode *sumWord, int wordNum)
    {
        int idx = 0;
        while (idx < wordNum)
        {
            wordNode &entry = sumWord[idx];
            entry.weight = (entry.time * 1.0 / sumWordNum)*log((N*1.0) / (entry.textnum + 1));
            ++idx;
        }
    }
    // Returns the number of decimal digits of n (1 for values -9..9).
    // Works for any magnitude instead of stopping at four digits.
    int numWei(int n)
    {
        int digits = 1;
        while (n / 10 != 0)
        {
            n /= 10;
            digits++;
        }
        return digits;
    }
    // Fills file[0..n-1] with the corpus file names
    // "fileLib\txt1.txt", "fileLib\txt2.txt", ... (numbering starts at 1).
    //
    // snprintf appends the number directly after the prefix, so the result
    // does not depend on a hand-counted prefix length, and it never writes
    // past the 50-byte row.
    void fileNameMake(char file[][50], int n)
    {
        for (int i = 0; i < n; i++)
        {
            snprintf(file[i], sizeof(file[i]), "fileLib\\txt%d.txt", i + 1);
        }
    }
    // Fills savefile[0..n-1] with the result file names
    // "savefile\save1.txt", "savefile\save2.txt", ... (numbering starts at 1).
    //
    // snprintf appends the number directly after the prefix, so the result
    // does not depend on a hand-counted prefix length, and it never writes
    // past the 50-byte row.
    void saveNameMake(char savefile[][50], int n)
    {
        for (int i = 0; i < n; i++)
        {
            snprintf(savefile[i], sizeof(savefile[i]), "savefile\\save%d.txt", i + 1);
        }
    }
    // Fills textfile[0..n-1] with the question file names
    // "textfile\text1.txt", "textfile\text2.txt", ... (numbering starts at 1).
    //
    // snprintf appends the number directly after the prefix, so the result
    // does not depend on a hand-counted prefix length, and it never writes
    // past the 50-byte row.
    void textNameMake(char textfile[][50], int n)
    {
        for (int i = 0; i < n; i++)
        {
            snprintf(textfile[i], sizeof(textfile[i]), "textfile\\text%d.txt", i + 1);
        }
    }
    // Scores the answer options of the question stored in file0.
    //
    // file0     : question file; the question part starts at the first token
    //             beginning with '*' and ends at the next such token.
    // k         : index in answers where the reference answer is stored.
    // answers   : receives the second character of the token that follows the
    //             closing '*' token (presumably the answer letter; confirm
    //             against the input file format).
    // answerNum : incremented once per call.
    //
    // The global score array is cleared first so that scores of a previous
    // article do not leak into this one. Tokens "A." "B." "C." "D." switch to
    // the next option; every other token adds the weight of the matching
    // vocabulary word to the current option. Tokens seen before the first
    // option marker, or after a fifth marker, are ignored rather than being
    // written outside score. Exits the program if the file cannot be opened.
    void scoreArticle(char *file0,int k,char *answers, int &answerNum)
    {
        FILE *file;
        int i;
        const int optionCount = sizeof(score) / sizeof(score[0]);
        if ((file = fopen(file0, "r")) == NULL) {
            printf("%s文件读取失败!", file0);
            system("pause");
            exit(1);
        }
        for (i = 0; i < optionCount; i++)
            score[i] = 0;
        int count = -1;   // index of the option being read; -1 before the first marker
        // Width 29 keeps every token inside the 30-byte temp buffer.
        while ((fscanf(file, "%29s", temp)) != EOF&&temp[0] != '*') {}// skip to the question marker
        while ((fscanf(file, "%29s", temp)) != EOF)
        {
            if (temp[0] == '*')
                break;
            if (!strcmp(temp, "A.") || !strcmp(temp, "B.") || !strcmp(temp, "C.") || !strcmp(temp, "D."))
            {
                count++;
                continue;
            }
            if (count < 0 || count >= optionCount)
                continue;
            wordDelSpe(temp);
            for (i = 0; i<(int)strlen(temp); i++)// convert upper-case letters to lower case
                if (temp[i] >= 'A'&& temp[i] <= 'Z')
                    temp[i] += 32;
            for (i = 0; i < wordNum; i++)
            {
                if (!strcmp(temp, sumWord[i].word))
                    score[count] += sumWord[i].weight;
            }
        }
        fscanf(file, "%29s", temp);
        answers[k] = temp[1];
        answerNum++;
        fclose(file);
    }
    // Strips special characters from token, lowercases it, and multiplies by
    // 1.5 the weight of every vocabulary entry equal to it.
    static void boostMatchingWord(char *token)
    {
        wordDelSpe(token);
        for (char *p = token; *p != '\0'; ++p)
            if (*p >= 'A' && *p <= 'Z')
                *p += 32;
        for (int i = 0; i < wordNum; i++)
        {
            if (!strcmp(token, sumWord[i].word))
                sumWord[i].weight *= 1.5;
        }
    }

    // Raises the weight of words by position in the article stored in file0.
    //
    // Two spans are boosted: from the start of the file up to the first token
    // ending in '#', and from after the next token starting with '#' up to the
    // first token starting with '*' (or end of file). Every occurrence of a
    // vocabulary word inside those spans multiplies its weight by 1.5.
    // Exits the program if the file cannot be opened.
    void doArticleLocal(char *file0)
    {
        FILE *file;
        if ((file = fopen(file0, "r")) == NULL) {
            printf("%s文件读取失败!",file0);
            system("pause");
            exit(1);
        }
        // Width 29 keeps every token inside the 30-byte temp buffer.
        while ((fscanf(file, "%29s", temp)) != EOF)
        {
            if (temp[strlen(temp) - 1] == '#')   // end of the first span
                break;
            boostMatchingWord(temp);
        }
        while ((fscanf(file, "%29s", temp)) != EOF&&temp[0] != '#') {}// skip to the next '#' marker
        while ((fscanf(file, "%29s", temp)) != EOF)
        {
            if (temp[0] == '*')                  // question part begins
                break;
            boostMatchingWord(temp);
        }
        fclose(file);
    }
    void doArticleAll(char *file0,char file[][50],char *savefile,int id,char *answers,int &answerNum,int &correctNum)
    {
        /*cout << "correctNum" << correctNum << endl;*/
        int i;
        char ans;    //答案
        doArticle(file0);    //处理题目文本
        fileCount(file);    //统计文件库中单词出现次数
        calWeight(sumWord, wordNum);    //计算权重
        sortWord();    //排序
        doArticleLocal(file0);//根据位置调整权值
        FILE *p = fopen(savefile, "w");        //文本输出    
        fprintf(p, "    word         	词频	文章数	权重
    ");            //输出到文档结果
        fprintf(p, "本文共%d个词,%d个不重复词
    ", sumWordNum, wordNum);
        for (i = 0; i < wordNum; i++)
        {
            fprintf(p, "%-16s	%d	%d	%f
    ", sumWord[i].word, sumWord[i].time, sumWord[i].textnum, sumWord[i].weight);
        }
        fclose(p);
        //doArticleLocal(file0);//根据位置调整权值
        scoreArticle(file0,id,answers,answerNum);
        std::cout << ""<<id+1<<"题结果成功输出到文件:" << savefile << endl;
        std::cout << "成功得到结果:" << endl;
        for (int h = 0; h < 4; h++)
        {
            cout << score[h] << endl;
        }
        int max = 0;
        for (int k = 1; k < 4; k++)
        {
            if (score[k] > score[max])
                max = k;
        }
        ans = 65 + max;
        cout << "答案是:" << ans ;
        if (answers[id] == ans)
        {
            cout <<" 正确"<<endl;
            correctNum++;
        }
        else
            cout << " 错误 (正确答案为:" << answers[id]<<"" << endl ;
        cout << endl << "-----------------------------" << endl;
        /*cout << "correctNum" << correctNum << endl;*/
    }
    void clearSumWord()//清空数组
    {
        for (int i = 0; i < 1000; i++)
        {
            sumWord[i].weight = 0;
            sumWord[i].time = 0;
            sumWord[i].textnum = 0;
            strcpy(sumWord[i].word, "");
        }
    }
    // Program entry point: builds the file-name tables, loads the stop words,
    // processes the textN question texts and prints the accuracy and run time.
    int main(int n, char *arg[])
    {
        char answers[textN] = { '\0' };      // reference answer of each question
        int answerNum = 0,correctNum = 0;
        double corretPersent;                // accuracy in percent
        char textfile[textN][50] = {};       // question file names
        char savefile[textN][50] = {};       // result file names
        static char file[N][50];             // corpus file names (static: about 260 KB, kept off the stack)
        char LibStop[] = "stopLib\\stop.txt";    // stop-word file
        clock_t start0, finish0;             // run-time measurement
        double sftime0;
        start0 = clock();
        fileNameMake(file, N);
        textNameMake(textfile, textN);
        saveNameMake(savefile, textN);
        cout << endl;
        wordleng = DoLibStop(LibStop, Libword);
        for(int k=0;k<textN;k++)
        {
            doArticleAll(textfile[k], file, savefile[k],k,answers,answerNum,correctNum);
            clearSumWord();
        }
        corretPersent = correctNum*100.0 / textN;
        printf("共%d篇文章,正确率为%.2f%%\n", textN, corretPersent);
        finish0 = clock();
        sftime0 = (double)(finish0 - start0) / CLOCKS_PER_SEC;// elapsed seconds
        std::cout << endl<< "共用时间:" << sftime0 << "秒." << endl;
        system("pause");
        return 0;
    }


    第1题结果成功输出到文件:savefile\save1.txt
    成功得到结果:
    4.27272
    4.31105
    4.24789
    4.24789
    答案是:B 正确

    -----------------------------
    第2题结果成功输出到文件:savefile\save2.txt
    成功得到结果:
    4.30785
    4.31105
    4.25257
    4.32183
    答案是:D 错误 (正确答案为:C)

    -----------------------------
    第3题结果成功输出到文件:savefile\save3.txt
    成功得到结果:
    4.47317
    4.4314
    4.25882
    4.34237
    答案是:A 正确

    -----------------------------
    第4题结果成功输出到文件:savefile\save4.txt
    成功得到结果:
    7.15344
    4.4314
    6.94828
    7.02264
    答案是:A 错误 (正确答案为:B)

    -----------------------------
    第5题结果成功输出到文件:savefile\save5.txt
    成功得到结果:
    7.16518
    4.43581
    6.95683
    7.02264
    答案是:A 正确

    -----------------------------
    第6题结果成功输出到文件:savefile\save6.txt
    成功得到结果:
    7.16882
    4.43563
    6.97361
    7.05793
    答案是:A 错误 (正确答案为:C)

    -----------------------------
    第7题结果成功输出到文件:savefile\save7.txt
    成功得到结果:
    7.36186
    4.62905
    7.17293
    7.17759
    答案是:A 错误 (正确答案为:B)

    -----------------------------
    第8题结果成功输出到文件:savefile\save8.txt
    成功得到结果:
    7.40113
    4.63213
    7.21154
    7.23798
    答案是:A 错误 (正确答案为:B)

    -----------------------------
    第9题结果成功输出到文件:savefile\save9.txt
    成功得到结果:
    7.4557
    4.67378
    7.2737
    7.28944
    答案是:A 错误 (正确答案为:C)

    -----------------------------
    第10题结果成功输出到文件:savefile\save10.txt
    成功得到结果:
    7.55512
    4.67378
    7.2737
    7.28944
    答案是:A 错误 (正确答案为:D)

    -----------------------------
    共10篇文章,正确率为30.00%

    共用时间:111.989秒.
    请按任意键继续. . .

    输入文章第一段结尾和最后一段开头标记#,题目和答案标记*

  • 相关阅读:
    Hive与Hadoop的交互流程
    Hadoop Webhdfs
    Hadoop HDFS的Java操作
    Hadoop JobHistory
    使用Eclipse构建Maven项目环境搭建
    Shell脚本简介 — 持续更新
    Hadoop基础 — Hadoop Shell
    jQuery火箭图标返回顶部代码
    jQuery火箭图标返回顶部代码
    jQuery火箭图标返回顶部代码
  • 原文地址:https://www.cnblogs.com/BetterThanEver_Victor/p/6347836.html
Copyright © 2011-2022 走看看