zoukankan      html  css  js  c++  java
  • 歌词VSM实现!!!

    主程序文件:VSM_SetUP.cpp

     1 /*[ar:歌词作者]
     2 [ti:歌词(歌曲)的标题]
     3 */
     4 //编写程序实现lrc(歌词) 文件的检索,检索模型要求采用向量空间模型。请将源程序和文档压缩后,一并上传。
     5 #include<iostream>
     6 
     7 #include"Document_Index.h"
     8 
     9 using namespace std;
    10 void main() {
    11     cout << "*****本程序实现一个向量空间模型,对(D:\暂时的)文件夹下的lrc文件进行遍历建立一个对应于歌曲名,作曲者,歌词主体的检索系统,请根据需要进行查询*****" << endl << endl;
    12     cout << "*****检索出的文档编号对应的文档名字请在(检索结果.txt)内查找*****" << endl;
    13     string query;
    14     int select,result;
    15     int isornot = 2;
    16     Doc_Analysis doc_analysis;
    17     doc_analysis.SETUP_Index();
    18     ReQuery getResult(&doc_analysis);
    19     
    20     while (isornot>=1) {
    21         cout << "输入查询词项:";
    22         if (isornot != 2) {
    23             getline(cin, query);
    24         }
    25         getline(cin, query);
    26         
    27         cout << "请选择查询模式(1为查歌曲,2为查歌手,3为查歌词主体):";
    28         cin >> select;
    29         cout << "请选择返回结果的数量:";
    30         cin >> result;
    31         getResult.Query(query, result, select);
    32         cout << "is or not(1表示继续查询,0表示退出查询) ?    " ;
    33         cin >> isornot;
    34     }
    35 }

    类的声明与实现分别保存在 Document_Index.h 和 Document_Index.cpp 中:

      1 #pragma once
      2 #include<iostream>
      3 #include<fstream>
      4 #include<vector>
      5 #include<math.h>
      6 #include<string>
      7 #include<iomanip>
      8 #include <stdio.h>
      9 #include<io.h>
     10 #include <windows.h>
     11 using namespace std;
     12 
     // Capacity of the per-document lyric word table and its merge scratch.
     static const int Maxsize = 10000;
     // Capacity of one line buffer and of the small merge scratch table.
     static const int maxsize = 100;
     15 
     // A document's score paired with the document number it belongs to.
     // Members are zero-initialised so a default-constructed value is never
     // read as garbage.
     struct ScoreandDoc {
         float score = 0.0f;
         int text_number = 0;
     };
     21 
     // One posting: the number of a document containing the term and the
     // term's frequency (tf) in that document. Members are zero-initialised.
     struct Word_Doc {
         int text_number = 0;
         int text_fre = 0;
     };
     27 
     28 //设计保存词项倒排记录头部的一个结构
     29 struct Index_List {//存储每个词的头项,包含单词和指向倒排记录的指针,存储df,df是包含词项的文档的数目,同时也是倒排记录的长度。next指向下一个词项
     30     float  df;
     31     string word;
     32     vector<Word_Doc> head_docID;// = nullptr;
     33     Index_List * next = nullptr;
     34 };
     35 
     36 //文档检索类,VSM的主体
     37 class Doc_Analysis {
     38 
     39     string BTEMP[Maxsize];
     40     string TEMP[maxsize];
     41     int arsize;//
     42     int tisize;
     43     Index_List * arofMusic_idList;  //歌曲作者索引的链表头
     44     Index_List * tiofMusic_idList;  //歌曲名字索引的链表头
     45     Index_List * idList;              //主体歌词倒排索引的链表头
     46     int size; //文档的词项的数目,即文档长度
     47     int allsize;//总的词数
     48      int N;                     //歌词主体文档集的大小,共有N篇文档
     49      int sizeofmusicname;          //歌词名字和歌词作者的大小
     50     string Inp_Temp_Lyrics[Maxsize];    //存放歌词主体
     51     string ti_Temp[maxsize/5];//歌曲名称和歌曲作者的暂时存放之地
     52     string ar_Temp[maxsize / 5];//歌曲名称和歌曲作者的暂时存放之地
     53 public:
     54     Doc_Analysis() {
     55         size = 0;
     56         allsize = 0;
     57         arsize = 0;
     58         tisize = 0;
     59         N = 0;
     60         idList = nullptr;
     61         arofMusic_idList = nullptr;
     62         tiofMusic_idList = nullptr;
     63     };
     64     ~Doc_Analysis() {
     65 
     66     };
     67 
     68 
     69     //把从文档中检索的词插入Inp_Temp_Words[Maxsize],如果不在就直接插入,如果已经存在则加一个
     70 
     71     //打开文件输入歌词,对词项进行分析,把歌曲的作者和歌曲名取出,存入对应的倒排索引,但是这个索引很小,所以可以直接构建倒排索引表。参数分别为:文件名,文档的编号。先对文档内歌词的作者和调用分离出的词项最终存储在Inp_Temp_Words[Maxsize],返回文档的词数
     72     int Doc_input(string filename, int number);
     73 
     74     //被int Doc_input(string filename, int number);调用,将分离出的词项存储在temp_Words[]中,size表示其大小,j表示其从哪一个数开始放入
     75     int Temp_Insert(string temp_words[],char T[],int &size);
     76 
     77     //对此文档的词项的表进行归并排序(按字典序)
     78     void Doc_mergesort(string *inputWord, string* Temp, int left, int right);
     79 
     80     //将此次输入的文档分词排序后得到的词项表存入最终的倒排索引中
     81     Index_List* insert_IndexList(string *inputWord, int n,int NofDoc, Index_List * idList);
     82 
     83     //歌曲名,作者,歌词主体倒排索引总体构建
     84     void SETUP_Index();// {};
     85 
     86     //返回最终查询的文档集大小
     87     int SizeOfDocSet() {
     88         return N;
     89     }
     90 
     91     //返回最终生成的歌词主体倒排索引表
     92     Index_List* tiIndex_head() {
     93         return tiofMusic_idList;
     94     };
     95 
     96     //返回最终生成的歌ming倒排索引表
     97     Index_List* DocIndex_head() {
     98         return idList;
     99     };
    100 
    101     //返回最终生成的作者倒排索引表
    102     Index_List* arIndex_head() {
    103         return arofMusic_idList;
    104     };
    105 
    106     //此函数实现寻找指定文件夹下的指定后缀文件,并且保存其完整的路径
    107     void GetAllFormatFiles(string path, vector<string>& files, string format);
    108 //
    109 };
    110 
    111 //对输入的查询词项进行分析,返回输入结果
    112 class ReQuery {
    113     Doc_Analysis* LMA;//歌词倒排索引的链表头
    114     float *Scores;//每个查询词项的初始得分
    115     float *arLength;//每个文档的长度
    116     float *tiLength;
    117     float *Length;
    118     int N;//文档集大小
    119 public:
    120     ReQuery(Doc_Analysis* TEMP) {//得到Doc_Analysis返回的文档集长度和链表头
    121         N = TEMP->SizeOfDocSet();
    122         Scores = new float[N];
    123         Length = new float[N];
    124         
    125         LMA = TEMP;
    126         for (int i = 0; i < N; i++) {
    127             Scores[i] = 0;
    128             Length[i] = 0;
    129         }
    130     };
    131     ~ReQuery() {
    132         delete[] Scores;
    133         delete[] Length;
    134         delete[] arLength;
    135         delete[] tiLength;
    136     };
    137 
    138     //查询所有词项,对所有倒排索引表遍历一次,将每个向量的长度计算出来,初始化得分数组
    139     void initialLength(Index_List * idList, float *tempLength);
    140 
    141     //输入查询歌词词项,输出查询结果,返回排名前k的文档编号,select 为选择的查询模式,1为查歌名,2为查歌手,3为查歌词主体
    142     bool Query(string query, int k,int select=1);
    143 
    144     //查询某一个词是否在其中,有则返回其df,无则返回0
    145     int isInner(string elem, Index_List idList[],Index_List &nowTemp);
    146 
    147     //某个词项对于suoyou文档的得分
    148     float ScoreofaDoc(Index_List *idList, Index_List* word, int iQ);
    149 
    150     //进行堆排序,将所有的得分进行排序
    151     bool HeapSort(float Scores[], int n,int k);
    152 
    153 };
    154 
    155 //堆排序,建立最大堆
    156 class Max_Heap {
    157     ScoreandDoc *Heap;
    158     int size;
    159     int n;
    160     void siftdown(int elem);
    161 public:
    162     Max_Heap(int num, int max, ScoreandDoc *temp) {
    163         n = num;
    164         size = max;
    165         Heap = temp;
    166         buildHeap();
    167     };
    168     void buildHeap() {
    169         for (int i = n / 2 - 1; i >= 0; i--)
    170             siftdown(i);
    171     };
    172     int heapsize()const {
    173         return n;
    174     }
    175     bool isLeaf(int pos)const {
    176         return (pos >= n / 2) && (pos < n);
    177     }
    178     int leftchild(int pos)const {
    179         return 2 * pos + 1;
    180     }
    181     int rightchild(int pos)const {
    182         return 2 * pos + 2;
    183     }
    184     ScoreandDoc removemax(float it);
    185 
    186 };

    ——————————————————————————————————————————————————————————————————————————————————

      1 #pragma once
      2 #include"Document_Index.h"
      3 #include<iostream>
      4 #include<fstream>
      5 #include<vector>
      6 #include<math.h>
      7 #include<string>
      8 #include<iomanip>
      9 #include <stdio.h>
     10 #include<io.h>
     11 #include <windows.h>
     12 using namespace std;
     13 
     14 ///////////类:Doc_Analysis///////////////////////
     15 
     16 //打开文件输入歌词,对词项进行分析,把歌曲的作者和歌曲名取出,存入对应的倒排索引,但是这个索引很小,所以可以直接构建倒排索引表。参数分别为:文件名,文档的编号。调用分离出的词项最终存储在Inp_Temp_Words[Maxsize],返回歌词文档的词数
     17 int Doc_Analysis::Doc_input(string filename, int number) {
     18     ifstream fin(filename);
     19     if (!fin.is_open()) {
     20         exit(0);
     21     }
     22     N++;
     23 //    cout << "此文件夹文档数目:" << N << endl;
     24     char c[maxsize] = { '' };
     25     int ic = 0, i = 2;
     26     int numberofDoc = 0;
     27     string str;
     28     getline(fin, str);
     29 
     30     //分离出作者
     31     for (; str[i] != ']'; i++) {
     32         if (str[i] == ':'&&str[i - 1] == 'r'&&str[i - 2] == 'a') {
     33             for (; str[i] != ']'; i++) {
     34                 if (((int)str[i] >= 65) && ((int)str[i] <= 91))
     35                     c[ic++] = (int)str[i] + 32;
     36                 else
     37                     c[ic++] = str[i];
     38             }
     39             i--;
     40         }
     41     }
     42     Temp_Insert(ar_Temp,c,arsize);
     43     
     44     Doc_mergesort(ar_Temp, TEMP, 0, arsize - 1);
     45     
     46     
     47     //分离出歌名
     48     getline(fin, str);
     49     i = 2;
     50     ic = 0;
     51     for (; str[i] != ']'; i++) {
     52         if (str[i] == ':'&&str[i - 1] == 'i'&&str[i - 2] == 't') {
     53             for (; str[i] != ']'; i++) {
     54                 if (((int)str[i] >= 65) && ((int)str[i] <= 91))
     55                     c[ic++] = (int)str[i] + 32;
     56                 else
     57                     c[ic++] = str[i];
     58             }
     59             i--;
     60         }
     61     }
     62 
     63     //cout << "歌名:" << c << endl;
     64     Temp_Insert(ti_Temp,c, tisize);
     65     
     66     //遍历整个geci文档主体,每次读取一行,然后进行分析
     67     getline(fin, str);
     68     
     69     do {
     70         //cout <<"收到: " <<str << endl;
     71         ic = 0;
     72         for (i = 0; str[i] != ']'; i++);
     73         for (i++; str[i] != '
    '&&str[i] != '
    '&&str[i] != ''; i++) {
     74 
     75             //去掉引号后面的字符,但是如果是t的话就不去
     76             if ((int)str[i] == 39) {
     77                 while (str[i] != ' '&&str[i] != '
    '&&str[i] != '
    '&&str[i] != '') {
     78                     i++;
     79                     if (str[i] == 't') {
     80                         i--;
     81                         break;
     82                     }
     83                 }
     84                 if (str[i] == '
    ' || str[i] == '
    ' || str[i] == '')
     85                     break;
     86             }
     87 
     88             //除去大小写
     89             if (((int)str[i] >= 65) && ((int)str[i] <= 91))
     90                 c[ic++] = (int)str[i] + 32;
     91             else
     92                 c[ic++] = str[i];
     93         }
     94         c[ic] = '';
     95         
     96         Temp_Insert(Inp_Temp_Lyrics, c, numberofDoc);
     97         getline(fin, str);
     98     } while (!fin.eof());
     99     fin.close();
    100     allsize += numberofDoc;
    101 
    102     //cout << "本文档最终分离出词数:" << size << endl;
    103     size = numberofDoc;
    104     return numberofDoc;
    105 };
    106 
    107 //被int Doc_input(string filename, int number);调用,将分离出的词项存储在temp_Words[]中,size表示其大小
    108 int Doc_Analysis::Temp_Insert(string temp_words[], char T[],int &size) {
    109     const char *d = "[] -;,:/?!.()"";//以这些字符为分界符[] -;,:/?!.()"
    110     char *p = NULL;
    111     char *next_p = NULL;
    112     p = strtok_s(T, d, &next_p);
    113     while (p)
    114     {
    115         //cout << p << endl;
    116         temp_words[size++] = p;//put the char* into temp table
    117         p = strtok_s(NULL, d, &next_p);
    118     }
    119     
    120     return size;
    121 };
    122 
    123 //对此文档的词项的表进行归并排序(按字典序)
    124 void Doc_Analysis::Doc_mergesort(string *inputWord, string* Temp, int left, int right) {
    125     int i, j, k, mid = (left + right) / 2;
    126     if (left == right)
    127         return;
    128     Doc_mergesort(inputWord, Temp, left, mid);
    129     Doc_mergesort(inputWord, Temp, mid + 1, right);
    130     for (i = mid; i >= left; i--)
    131         Temp[i] = inputWord[i];
    132     for (j = 1; j <= right - mid; j++)
    133         Temp[right - j + 1] = inputWord[j + mid];
    134     for (i = left, j = right, k = left; k <= right; k++)
    135         if (Temp[i]<= Temp[j])
    136             inputWord[k] = Temp[i++];
    137         else
    138             inputWord[k] = Temp[j--];
    139 };
    140 
    141 //将此次输入的文档分词排序后得到的词项表存入最终的倒排索引中,numberofDoc为此文档分离出的词的数目,(不是词项)NofDoc为文档的编号
    142 Index_List* Doc_Analysis::insert_IndexList(string *inputWord, int numberofDoc,int NofDoc, Index_List * idListx) {
    143     int i = 0, j = 0;
    144     Index_List* pre_idList = idListx,*idList=idListx, *idListHead = idListx;    
    145     //cout << "词数" << numberofDoc << endl;
    146     if (i < numberofDoc) {
    147         //cout << " 当前文档的词: " << inputWord[i] << endl;
    148         while ((idList != nullptr)&&(i<numberofDoc)) {//将整个倒排索引在此遍历完全,在文档也未结束的情况下
    149             //1.词项和目前监测的节点值一样,则直接在其后的此词项的后面加上本文档的相关信息即可
    150             if (inputWord[i] == idList->word) {
    151                 //cout << "此时词项" << inputWord[i] << "已存在索引表中" << endl;
    152                 Word_Doc *temp = new Word_Doc;
    153                 temp->text_number = NofDoc;//这个词项的文档编号,把所有相同的词项合并在一起
    154                 temp->text_fre = 0;//肯定已经在这个文档出现了一次
    155                 do {
    156                     temp->text_fre++;
    157                     i++;
    158                     if (i == numberofDoc)
    159                         break;
    160                 } while (inputWord[i] == idList->word);//只有当文档检测的词项不一样时退出
    161 
    162                 idList->df++;//出现该词项的文档数增1,应该为df
    163 
    164                 idList->head_docID.push_back(*temp);
    165                 
    166                 /*cout << (idList->head_docID)[idList->head_docID.size() - 1].text_number << endl;*/
    167                 
    168                 pre_idList = idList;//前一个链表值
    169 
    170                 //print2(idList);//查看这个idList的具体值
    171 
    172                 idList = idList->next;//索引表下移
    173             }
    174             //2.当这个词项比当前索引的词项小时,说明词项肯定在倒排索引中排在当前词项的前面,则将其插入在其之前,注意区分第一个和中间的
    175             else if (inputWord[i] < idList->word) {
    176                 //cout << inputWord[i] << "比索引表的——" << idList->word << " 小" << endl;
    177                 Index_List* newidList = new Index_List;
    178                 vector<Word_Doc> forID ;//因为是单独建一个词项的索引,故建立存储倒排索引的容器
    179                 Word_Doc *temp = new Word_Doc;
    180                 temp->text_number = NofDoc;
    181                 temp->text_fre = 0;//肯定已经出现过一次,把所有相同的词项合并在一起
    182                 if (idList->word==pre_idList->word) {
    183                     //cout << "这个词即将插入索引头。" << endl;
    184                     idListHead = newidList;
    185                 }
    186                 else {
    187                     pre_idList->next = newidList;
    188                 }
    189                 do {
    190                     temp->text_fre++;
    191                     i++;
    192                     if (i == numberofDoc)
    193                         break;
    194                 } while (inputWord[i] == inputWord[i - 1]);//只有当文档检测的词项不一样时退出
    195                 
    196                 forID.push_back(*temp);
    197 
    198                 newidList->df = 1;
    199                 newidList->next = idList;
    200                 
    201                 pre_idList = newidList;
    202                 newidList->word = inputWord[i-1];
    203                 newidList->head_docID = forID;
    204 
    205 
    206             }
    207             //3.当目前文档的词比索引的词项大时,倒排索引表向后走
    208             else {
    209                 //cout << inputWord[i] << "比索引表的——" << idList->word << " 大" << endl;
    210 
    211                 pre_idList = idList;
    212                 //cout << idList->word << endl;
    213                 idList = idList->next;
    214             }
    215         }
    216         //idList==nullptr,,,if条件句成立意味着倒排索引表已经到达尾部,接下来的所有词项都大于索引表内任何词项,可以直接插入,注意区分第一个和中间的
    217         while (i < numberofDoc) {
    218             idList = new Index_List;
    219             if (idListHead == nullptr) {//如果是
    220                 pre_idList = idList;
    221                 idListHead = idList;
    222             }
    223             else {
    224                 pre_idList->next = idList;
    225             }
    226             
    227             vector<Word_Doc> forID;//建立存储这个词项的倒排索引的容器
    228             Word_Doc *temp = new Word_Doc;
    229             temp->text_number = NofDoc;
    230             temp->text_fre = 0;
    231             do {
    232                 temp->text_fre++;
    233                 i++;
    234                 if (i == numberofDoc)
    235                     break;
    236                 
    237             } while (inputWord[i] == inputWord[i - 1]);//把所有相同的词项合并在一起,只有词项不一致时才退出
    238             forID.push_back(*temp);
    239 
    240             idList->df = 1;
    241             
    242             pre_idList = idList;
    243             idList->word = inputWord[i-1];
    244             idList->head_docID = forID;
    245 
    246             //print2(idList); 
    247             idList = idList->next;
    248         }
    249     }
    250     //print1(idListHead);
    251     return idListHead;
    252 };
    253 
    254 //歌曲名,作者,歌词主体倒排索引总体构建
    255 void Doc_Analysis::SETUP_Index(){
    256     int i;
    257     string tx_filePath = "", filePath = "D:\暂时的", distAll = "检索结果.txt", format = ".lrc";
    258     vector<string> files;
    259 
    260     GetAllFormatFiles(filePath, files, format);
    261     distAll = filePath + "\" + distAll;
    262     ofstream ofn(distAll);
    263     int tsize = files.size();
    264     cout << "文件夹下的.lrc数目:" << tsize << endl;//查询出文件夹下文档的数目
    265     for (i = 0; i < tsize; i++)//一次遍历,每检索一个文档将其存入相应的缓冲区,然后建立倒排索引
    266     {
    267         ofn <<"文档"<<i<<""<< files[i] << endl; // 写入文件  
    268         Doc_input(files[i], i);
    269         Doc_mergesort(ar_Temp, TEMP, 0, arsize-1);
    270         Doc_mergesort(ti_Temp, TEMP, 0, tisize-1);
    271         Doc_mergesort(Inp_Temp_Lyrics,BTEMP, 0, size-1);
    272     
    273         //插入倒排索引
    274 
    275         arofMusic_idList = insert_IndexList(ar_Temp, arsize, i, arofMusic_idList);
    276         tiofMusic_idList = insert_IndexList(ti_Temp, tisize, i, tiofMusic_idList);
    277         idList=insert_IndexList(Inp_Temp_Lyrics, size, i, idList);
    278     
    279 
    280         arsize = 0;
    281         tisize = 0;
    282         size = 0;
    283         
    284         //cout << "索引链表内容如下:" << endl;
    285         //cout << "作者:" << endl;
    286         //print1(arofMusic_idList);
    287         //cout << "歌名:" << endl;
    288         //print1(tiofMusic_idList);
    289         //cout << "主体:" << endl;
    290         //print1(idList)
    291         //insert_IndexList(ti_Temp, arsize, i, tiofMusic_idList);
    292         //insert_IndexList(Inp_Temp_Lyrics, arsize, i, idList);
    293     }
    294     //至此,索引构建完毕
    295     /*cout << "主体:" << endl;
    296     print1(idList);*/
    297     ofn <<endl<< "文件夹下的.lrc数目:" << tsize << endl;
    298     cout << endl;
    299     ofn << "检索出词数(非词项数):" << allsize << endl;
    300     ofn.close();
    301     cout << "一共检索出词数(非词项数):" << allsize << endl;
    302     cout << "歌曲名索引构建完毕!!!" << endl;
    303     cout << "作曲者索引构建完毕!!!" << endl;
    304     cout << "歌词主体索引构建完毕!!!" << endl;
    305     cout << endl;
    306 };
    307 
    308 //此函数实现寻找指定文件夹下的指定后缀文件,并且保存其完整的路径
    309 void Doc_Analysis::GetAllFormatFiles(string path, vector<string>& files, string format)
    310 {
    311     //文件句柄    
    312     long   hFile = 0;
    313     //文件信息    
    314     struct _finddata_t fileinfo;
    315     string p;
    316     if ((hFile = _findfirst(p.assign(path).append("\*" + format).c_str(), &fileinfo)) != -1)
    317     {
    318         do
    319         {
    320             if ((fileinfo.attrib &  _A_SUBDIR))
    321             {
    322                 if (strcmp(fileinfo.name, ".") != 0 && strcmp(fileinfo.name, "..") != 0)
    323                 {
    324                     files.push_back(p.assign(path).append("\").append(fileinfo.name));
    325                     GetAllFormatFiles(p.assign(path).append("\").append(fileinfo.name), files, format);
    326                 }
    327             }
    328             else
    329             {
    330                 files.push_back(p.assign(path).append("\").append(fileinfo.name));; //将文件路径保存,也可以只保存文件名:    p.assign(fileinfo.name)
    331             }
    332         } while (_findnext(hFile, &fileinfo) == 0);
    333 
    334         _findclose(hFile);
    335     }
    336 };
    337 
    338 
    339 ////////////////////类:ReQuery////////////////////
    340 
    341 //查询所有词项,对倒排索引表遍历一次,初始化得分数组
    342 
    343 void ReQuery::initialLength(Index_List * idList, float *tempLength) {
    344     float idf;//记录log N/df
    345     int size;
    346     int i;
    347     
    348     while (idList != nullptr) {
    349         idf = log(N/idList->df) / log(10);
    350         size = idList->head_docID.size();
    351         i = 0;
    352         for (; i < size; i++) {
    353             tempLength[idList->head_docID[i].text_number] += (idf*(1 + log(idList->head_docID[i].text_fre) / log(10)))*(idf*(1 + log(idList->head_docID[i].text_fre) / log(10)));
    354         }
    355         idList = idList->next;
    356     }
    357     for (i = 0; i < N; i++)
    358         tempLength[i] = sqrt(tempLength[i]);
    359     /*for (int i = 0; i < LMA->SizeOfDocSet();i++)
    360         cout << "文档" << i <<" 长度为 "<< tempLength[i] << endl;*/
    361 }
    362 
    363 //输入查询词项,输出查询结果,返回排名前k的文档编号,select 为选择的查询模式,1为查歌名,2为查歌手,3为查歌词主体
    364 bool ReQuery::Query(string query, int k, int select) {
    365     Index_List QUERY[10];
    366     char aa[50] = { '' };
    367     int ia = 0, i = 0, iQ = 0, j = 0;
    368     
    369     for (; i<query.length(); i++) {
    370         while (query[i] != ' '&& i < query.length()) {
    371             if (((int)query[i] >= 65) && ((int)query[i] <= 91))
    372                 aa[ia++] = query[i++] + 32;
    373             else
    374                 aa[ia++] = query[i++];
    375         }
    376             
    377         aa[ia] = '';
    378         for (; j < iQ; j++) {
    379             if (QUERY[j].word == aa) {
    380                 QUERY[j].df++;
    381                 j = -1;
    382                 break;
    383             }
    384         }
    385         if (j != -1) {
    386             QUERY[iQ++].word = aa;
    387             QUERY[iQ-1].df = 1;
    388         }
    389         j = 0;
    390         ia = 0;
    391         //cout << f[ic1 - 1] << endl;
    392     }
    393 
    394     //查歌名
    395     if (select == 1) {
    396         initialLength(LMA->tiIndex_head(), Length);
    397         //计算查询de得分
    398         ScoreofaDoc(LMA->tiIndex_head(), QUERY, iQ);
    399     }
    400     //查作者
    401     if (select == 2) {
    402         initialLength(LMA->arIndex_head(), Length);
    403         //计算查询de得分
    404         ScoreofaDoc(LMA->arIndex_head(), QUERY, iQ);
    405     }
    406     //查歌词主体
    407     if (select == 3) {
    408         initialLength(LMA->DocIndex_head(), Length);
    409         //计算查询de得分
    410         ScoreofaDoc(LMA->DocIndex_head(), QUERY, iQ);
    411     }
    412     /*for (int i = 0; i < N; i++) {
    413         cout << Scores[i] << endl;
    414     }*/
    415     //对得分数组建堆,并且返回前K个
    416     HeapSort(Scores, N, k);
    417     return true;
    418 };
    419 
    420 //词项对于suoyou文档的得分
    421 float ReQuery::ScoreofaDoc(Index_List *idList, Index_List Tword[],int iQ) {
    422     int size = 0;
    423     int df;
    424     float idf;
    425     Word_Doc TEMPS;
    426     Index_List nowTemp;
    427     for (int i = 0; i < iQ;i++) {
    428         
    429         if ((df = isInner(Tword[i].word,idList,nowTemp)) != 0) {
    430             idf = log(N/df) / log(10);
    431             size =nowTemp.head_docID.size();
    432             for (int j = 0; j < size; j++) {
    433                 TEMPS = (nowTemp.head_docID)[j];
    434                 Scores[TEMPS.text_number] += ((idf)*Tword[i].df)*(idf*(1 + log(nowTemp.head_docID[j].text_fre) / log(10)));
    435             }
    436         }
    437     }
    438     cout <<endl <<"各文档依次得分:" << endl;
    439     for (int i = 0; i < N; i++) {
    440         if(Scores[i]!=0)
    441             Scores[i] = Scores[i]/ Length[i];
    442         cout << Scores[i] <<" ";
    443     }
    444     return 0;
    445 }
    446 
    447 //查询某一个词是否在其中,有则返回其df,无则返回0
    448 int ReQuery::isInner(string elem, Index_List idList[], Index_List &nowTemp) {
    449     int i = 0;
    450     //cout << "查找单词" << elem << endl;
    451     while (idList != nullptr) {
    452         if (idList[i].word == elem) {
    453             //cout << "单词" << elem << "在其中,文档编号"<<idList[i].head_docID[0].text_number<<endl;
    454             nowTemp = idList[i];
    455             return idList->df;
    456         }
    457         if (idList->word > elem)
    458             return 0;
    459         idList = idList->next;
    460     };
    461     return 0;
    462 }
    463 
    464 //进行堆排序,将所有的得分进行排序,找出前k个,n为数组大小
    465 bool ReQuery::HeapSort(float Scores[],int n,int k) {
    466     float doc = -1;
    467     int i = 0;
    468     ScoreandDoc *TScores = new ScoreandDoc[n];
    469     for (; i < n; i++) {
    470         TScores[i].score = Scores[i];
    471         TScores[i].text_number = i;
    472     }
    473 
    474     Max_Heap H(n, n, TScores);
    475     i = 0;
    476     cout << endl << endl << "---------------------向您推荐如下文档--------------------------------" << endl;
    477     for (; i < k; i++) {
    478         ScoreandDoc temp= H.removemax(doc);
    479         cout << "文档编号:" << temp.text_number << " 得分:" << temp.score << endl;
    480     }
    481 
    482     cout << endl;
    483     delete[]TScores;
    484     return true;
    485 
    486 };
    487 
    488 //////////////////////////////lei Max_Heap///////////////////////////
    489 //堆建立
    490 
    491 //建立整堆
    492 void Max_Heap::siftdown(int pos) {
    493     while (!isLeaf(pos)) {
    494         int j = leftchild(pos);
    495         int rc = rightchild(pos);
    496         if ((rc < n) && (Heap[j].score < Heap[rc].score)) {
    497             j = rc;
    498         }
    499         if (!(Heap[pos].score < Heap[j].score))
    500             return;
    501         ScoreandDoc xxx = Heap[pos];
    502         Heap[pos] = Heap[j];
    503         Heap[j] = xxx;
    504         pos = j;
    505     }
    506 };
    507 
    508 
    509 //每次找出移除最大的
    510 ScoreandDoc Max_Heap::removemax(float it) {
    511     //if (n == 0)
    512     //    return ;
    513     ScoreandDoc xxx = Heap[--n];
    514     Heap[n] = Heap[0];
    515     Heap[0] = xxx;
    516     if (n != 0)
    517         siftdown(0);
    518     //it = Heap[n].score;
    519     return Heap[n];
    520 }
  • 相关阅读:
    使用密码解密TACACS+的报文
    C9K Stackwise Virtual(三)
    Webhook Configuration Example
    sup-bootflash和bootflash
    WLC5508 license没有500个?
    AAA Server Groups
    关于FlexConnect的Bug!
    Bug搬运工-CSCve57121--Cisco 2800, 3800 and 1560 series APs fail to pass traffic
    Bug搬运工-CSCvb29354-1810 OEAP cannot join vWLC
    阿里云云计算认证ACP模拟考试练习题第1套模拟题分享(共10套)
  • 原文地址:https://www.cnblogs.com/1996313xjf/p/6056012.html
Copyright © 2011-2022 走看看