zoukankan      html  css  js  c++  java
  • K-means文本聚类算法C++实现


    FROM:http://www.cnblogs.com/finallyliuyu/archive/2010/09/03/1817348.html

      1 头文件:
      2 
      3 
      4  #ifndef _Preprocess_H
      5 #define  _Preprocess_H
      // NOTE(review): the standard header names were lost when this listing was
      // extracted; the set below is reconstructed from what the code uses.
      #include <algorithm>
      #include <cmath>
      #include <cstdio>
      #include <cstdlib>
      #include <cstring>
      #include <fstream>
      #include <functional>
      #include <iostream>
      #include <locale>
      #include <map>
      #include <set>
      #include <sstream>
      #include <string>
      #include <utility>
      #include <vector>
      #include "ictclas30.h"
      #include "boost/tr1/regex.hpp"
      #include "boost/algorithm/string.hpp"
      #include "windows.h"
     22  
     23 //一些谓词函数
     24 using namespace std;
     25  
     // Preprocessing pipeline for text clustering: builds an inverted index
     // (word -> list of (articleId, term frequency)) from database rows, selects
     // feature words by document frequency, writes an ARFF file, and maps the
     // cluster output read from a results file back to articles.
     class Preprocess
     {
     private:
         char *bagofwordsAddress;    // file that stores the bag-of-words model
         char *featurewordsAddress;  // file that stores the selected feature words
         char *arffFileAddress;      // ARFF output file
         char *infoFromWekaAddress;  // file holding the results produced by Weka
         char *articleIdsAddress;    // file listing the ids of the clustered articles
         char *dbconnection;         // database connection string
         char *dbselect;             // database SELECT statement
         char *dbfield;              // database field; never allocated or freed by this class
         int beginIndex;             // first article id to cluster
         int endIndex;               // last article id to cluster

         // The class owns raw buffers, so copying would double-delete them.
         // Declared private and intentionally left undefined.
         Preprocess(const Preprocess &);
         Preprocess &operator=(const Preprocess &);

         // Allocates a buffer of `capacity` chars and copies `text` into it
         // verbatim, truncating if needed. The text is never interpreted as a
         // printf format string, so '%' characters in paths or SQL are safe.
         static char *copyToBuffer(const char *text, int capacity)
         {
             const std::size_t room = capacity > 0 ? static_cast<std::size_t>(capacity) : 1;
             char *buffer = new char[room];
             std::size_t length = text ? std::strlen(text) : 0;
             if (length > room - 1)
             {
                 length = room - 1;
             }
             if (length > 0)
             {
                 std::memcpy(buffer, text, length);
             }
             buffer[length] = '\0';
             return buffer;
         }

     public:
         // Pointer to a segmentation member function: (raw text, stop words) -> words.
         typedef std::vector<std::string> (Preprocess::*FUNCSEG)(std::string, std::set<std::string>);

         // Stores private copies of all paths / database strings.
         //   c_style_stringsize  capacity of every internal buffer, terminator included
         //   mydict              bag-of-words file
         //   keywordsinfo        feature-words file
         //   tobeCluster         ARFF file
         //   InfoFromWeka        Weka results file
         //   artileIds           article-id file
         //   conn / selectsql    database connection string and SELECT statement
         //   beginIndex/endIndex inclusive range of article ids to cluster
         Preprocess(int c_style_stringsize,const char *mydict,const char *keywordsinfo,const char *tobeCluster,const char * InfoFromWeka,const char *artileIds,const char *conn,const char *selectsql, int beginIndex,int endIndex)
             : bagofwordsAddress(copyToBuffer(mydict, c_style_stringsize)),
               featurewordsAddress(copyToBuffer(keywordsinfo, c_style_stringsize)),
               arffFileAddress(copyToBuffer(tobeCluster, c_style_stringsize)),
               infoFromWekaAddress(copyToBuffer(InfoFromWeka, c_style_stringsize)),
               articleIdsAddress(copyToBuffer(artileIds, c_style_stringsize)),
               dbconnection(copyToBuffer(conn, c_style_stringsize)),
               dbselect(copyToBuffer(selectsql, c_style_stringsize)),
               dbfield(NULL),
               beginIndex(beginIndex),
               endIndex(endIndex)
         {
         }

         ~Preprocess()
         {
             delete [] bagofwordsAddress;
             delete [] featurewordsAddress;
             delete [] arffFileAddress;
             delete [] infoFromWekaAddress;
             delete [] articleIdsAddress;
             delete [] dbconnection;
             delete [] dbselect;
         }

         // Strips the characters in `val` from both ends of `str`.
         void trim(std::string &str, const std::string val);
         // Builds the inverted index: key = word, value = list of (articleId, tf).
         int ConstructMap(std::map<std::string, std::vector<std::pair<int, int> > > &mymap, char *dbfield, FUNCSEG seg);
         // Empties the ARFF file (creating it if necessary).
         inline void TruncateArff()
         {
             std::ofstream ofile(arffFileAddress, std::ios::trunc);
         }
         // Saves the bag-of-words model to disk.
         void save(std::map<std::string, std::vector<std::pair<int, int> > > &mymap);
         // Loads the bag-of-words model from disk.
         void load(std::map<std::string, std::vector<std::pair<int, int> > > &mymap);
         // Prints the bag-of-words model.
         void print(std::map<std::string, std::vector<std::pair<int, int> > > &mymap);
         // Narrow string -> wide string.
         std::wstring myMultibyteToWideChar(std::string sResult);
         // Wide string -> narrow string.
         std::string myWideCharToMultibyte(std::wstring wsResult);
         // Word segmentation through ICTCLAS.
         std::string ICTsplit(const char *sInput);
         // Builds the stop-word set.
         std::set<std::string> MakeStopSet();
         // Segments the text and removes stop words / noise words.
         std::vector<std::string> goodWordsinPieceArticle(std::string rawtext, std::set<std::string> stopwords);
         // Integer -> string.
         std::string do_fraction(int val);
         // Floating point -> string.
         std::string do_fraction(double val, int decplaces = 5);
         // Feature-word selection by document frequency.
         void DFcharicteristicWordSelection(std::map<std::string, std::vector<std::pair<int, int> > > &mymap, int DFthreshold);
         // Returns the final feature words.
         std::vector<std::string> GetFinalKeyWords();
         // Returns (maxTF, DF) for every feature word.
         std::vector<std::pair<int, int> > GetfinalKeysMaxTFDF(std::map<std::string, std::vector<std::pair<int, int> > > &mymap);
         // Normalises a document vector.
         std::vector<std::pair<int, double> > NormalizationVSM(std::vector<std::pair<int, double> > tempVSM);
         // Builds the document vectors and writes them to the ARFF file.
         void VSMFormation(std::map<std::string, std::vector<std::pair<int, int> > > &mymap);

         std::string FormatVSMtoString(std::vector<std::pair<int, double> > tempVSM);
         // Writes the ARFF header.
         void WriteHeadArff();
         void WriteTotalArff(char *dbfield, int DFthreshlod, bool isbagOfwordsexsist, FUNCSEG seg);

         std::map<int, std::vector<double> > VSMConstruction(std::map<std::string, std::vector<std::pair<int, int> > > &mymap);

         std::map<std::string, std::vector<double> > GetClusters();

         double CalDotProductOfVectors(const std::vector<double> &vector1, const std::vector<double> &vector2);

         double CalCosineofVectors(const std::vector<double> &vector1, const std::vector<double> &vector2);

         std::vector<std::pair<int, std::string> > GenerateClusterInfo(std::map<int, std::vector<double> > &vsmMatrix, std::map<std::string, std::vector<double> > &clusters);

         std::map<std::string, std::vector<int> > FetchArticlesOFClusters(std::map<std::string, std::vector<double> > &clusters, std::vector<std::pair<int, std::string> > &resultInfo);
         void RetreiveArticleInfoFromDataBase();
         // Splits a keyword string on spaces.
         std::vector<std::string> mySplit(std::string s, std::set<std::string> stopwords);
     };
    146  
    147  
    148  
    149 #endif 
    150 
    151 
    152  Preprocess类的函数功能实现文件: 
    153 
    154 
    155 
    156 
    157  #include"stdafx.h"
    158 #include "Preprocess.h"
    159  
    160 #pragma comment(lib, "ICTCLAS30.lib")
    161 using namespace std;
    // Ordering predicate for (word, count) pairs: larger count first.
    bool isLonger(const std::pair<std::string, int> &pair1, const std::pair<std::string, int> &pair2)
    {
        return pair1.second > pair2.second;
    }
    // True when the count of a (word, count) pair is at most 100.
    bool cntAssist(const std::pair<std::string, int> &pair1)
    {
        return pair1.second <= 100;
    }
    // True when the posting (articleId, tf) belongs to the given article.
    bool PredTF(const std::pair<int, int> &pair1, int articleId)
    {
        return pair1.first == articleId;
    }
    // Unary predicate object: matches postings (articleId, tf) whose article id
    // equals the id captured at construction.
    class PredTFclass
    {
    private:
        const int m;  // article id to match
    public:
        PredTFclass(int id) : m(id) {}
        bool operator()(const std::pair<int, int> &pair1) { return pair1.first == m; }
    };
    // Ordering predicate for (cluster name, similarity) pairs: larger similarity
    // first. Uses a strict comparison because std::sort requires a strict weak
    // ordering; returning true for equal elements is undefined behaviour.
    bool myCmp(const std::pair<std::string, double> &pair1, const std::pair<std::string, double> &pair2)
    {
        return pair1.second > pair2.second;
    }
    186  
    187 void Preprocess:: trim(string  &str,const string val)
    188 {
    189     str.erase(0,str.find_first_not_of(val));
    190     str.erase(str.find_last_not_of(val)+val.size());
    191 }
    192 int Preprocess::ConstructMap(mapint,int>>>&mymap,char *dbfield,FUNCSEG seg)
    193 {
    194     //setMakeStopSet();
    195     CoInitialize(NULL);
    196     _ConnectionPtr pConn(__uuidof(Connection));
    197     _RecordsetPtr pRst(__uuidof(Recordset));
    198     pConn->ConnectionString=dbconnection;
    199     pConn->Open("","","",adConnectUnspecified);
    200     pRst=pConn->Execute(dbselect,NULL,adCmdText);
    201     setstopwords=MakeStopSet();
    202      
    203     while(!pRst->rsEOF)
    204     {   vectorwordcollection;
    205        //string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
    206         string rawtext=(_bstr_t)pRst->GetCollect(dbfield);
    207         if(rawtext!="")
    208         {
    209             wordcollection=(this->*seg)(rawtext,stopwords);
    210             string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
    211             int articleid=atoi(tempid.c_str());
    212             for(vector::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
    213             {
    214                 vectorint,int>>::iterator it;
    215                 if(mymap[*strit].empty())
    216                 {
    217                     pair<</code>int,int>mytemppair=make_pair(articleid,1);
    218                     mymap[*strit].push_back(mytemppair);
    219  
    220                 }
    221                 else
    222                 {
    223                     for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
    224                     { 
    225                         if(it->first==articleid)
    226                         {
    227                             it->second=++(it->second);
    228                             break;
    229                         }
    230  
    231                 }
    232                 if(it==mymap[*strit].end())
    233                 {
    234                     pair<</code>int,int>mytemppair=make_pair(articleid,1);
    235                     mymap[*strit].push_back(mytemppair);
    236                 }
    237  
    238             }
    239  
    240         }
    241  
    242  
    243     }
    244  
    245  
    246     pRst->MoveNext();
    247     wordcollection.clear();
    248  }
    249     pRst->Close();
    250     pConn->Close();
    251     pRst.Release();
    252     pConn.Release();
    253     CoUninitialize();
    254      
    255     return 0;
    256  
    257 }
    258 void Preprocess::save(mapint,int> > >&mymap)
    259 {
    260     ofstream outfile(bagofwordsAddress,ios::binary);
    261     outfile<<mymap.size()<<endl;
    262     mapint,int> > >::iterator it;
    263     for (it=mymap.begin();it!=mymap.end();it++)
    264     {   outfile<<it->first<<endl;
    265     vectorint,int>>::iterator subit;
    266     outfile<<it->second.size()<<endl;
    267     for(subit=(it->second).begin();subit!=(it->second).end();++subit)
    268     {
    269         outfile<<subit->first<<" "<<subit->second<<" "<<";"<<" ";
    270     }
    271     outfile<<endl;
    272     }
    273     //outfile.write((char *)&mymap,sizeof(mymap));
    274  
    275     outfile.close();
    276  
    277 }
    278 void Preprocess::load(mapint,int> > >&mymap)
    279 {
    280     std::locale loc1 = std::locale::global(std::locale(".936"));
    281     {
    282         // 在这里使用std::ifstream 或者 std::fstream
    283         ifstream infile(bagofwordsAddress,ios::binary);
    284         int lenMyMap;//保存词典长度
    285         int lenVector;//保存每个词出现的文章数目
    286         string key;//保存读出的map的键值
    287         int articleId;//文章标号
    288         int count;//在该文章中刚出现的数目
    289         string comma;
    290         string semicolon;
    291         infile>>lenMyMap;
    292         while(!infile.eof())
    293         {
    294             infile>>key;
    295             infile>>lenVector;
    296             vectorint,int> >temp;
    297             for (int i=0;i
    298             {
    299                 infile>>articleId>>count>>semicolon;
    300                 temp.push_back(make_pair(articleId,count));
    301             }
    302             mymap[key]=temp;
    303  
    304  
    305         }
    306  
    307  
    308         infile.close();
    309     }
    310     std::locale::global(std::locale(loc1));
    311  
    312 }
    // Prints the inverted index to stdout: the number of words, then for each
    // word its text, its posting count and the postings as "id,count;".
    // NOTE(review): this is a free function; the class also declares a member
    // Preprocess::print for which no definition is visible here - confirm
    // whether this definition was meant to be that member.
    void print(std::map<std::string, std::vector<std::pair<int, int> > > &mymap)
    {
        typedef std::vector<std::pair<int, int> > Postings;
        typedef std::map<std::string, Postings> Index;

        std::cout << mymap.size() << '\n';
        for (Index::const_iterator word = mymap.begin(); word != mymap.end(); ++word)
        {
            std::cout << word->first << '\n';
            std::cout << word->second.size() << '\n';
            for (Postings::const_iterator posting = word->second.begin(); posting != word->second.end(); ++posting)
            {
                std::cout << posting->first << ',' << posting->second << ";";
            }
            std::cout << std::endl;
        }
    }
    329 set Preprocess::MakeStopSet()
    330 {
    331     set stopwordsSet;
    332     ifstream ifile("stopwords.txt");
    333     while(!ifile.eof())
    334     {
    335         string temp;
    336         trim(temp," ");
    337         ifile>>temp;
    338         stopwordsSet.insert(temp);
    339     }
    340     return stopwordsSet;
    341 }
    342  
    343 string Preprocess::do_fraction(int val)
    344 {
    345     ostringstream out;
    346     out<<val;
    347     string str= out.str(); //从流中取出字符串
    348     str.swap(string(str.c_str()));//删除nul之后的多余字符
    349     return str;
    350  
    351 }
    352 string Preprocess::do_fraction(double val,int decplaces)
    353 {
    354      
    355     //int prec=numeric_limits::digits10;
    356     char DECIMAL_POINT='.';
    357     ostringstream out;
    358     //out.precision(prec);
    359     out<<val;
    360     string str=out.str();
    361     size_t n=str.find(DECIMAL_POINT);
    362     if((n!=string::npos)&&n+decplaces
    363     {
    364         str[n+decplaces]='';
    365     }
    366     str.swap(string(str.c_str()));
    367  
    368     return str;
    369 }
    370 wstring Preprocess::myMultibyteToWideChar(string sResult)
    371 {
    372     int iWLen=MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), 0, 0 );// 计算转换后宽字符串的长度。(不包含字符串结束符)
    373     wchar_t *lpwsz= new wchar_t [iWLen+1];
    374     MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), lpwsz, iWLen ); // 正式转换。
    375     lpwsz[iWLen] = L'';
    376     wstring wsResult(lpwsz);
    377     delete []lpwsz;
    378     return wsResult;
    379 }
    380 string Preprocess::myWideCharToMultibyte(wstring wsResult)
    381 {
    382     string sResult;
    383     int iLen= WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE ); // 计算转换后字符串的长度。(包含字符串结束符)
    384     char *lpsz= new char[iLen];
    385     WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE); // 正式转换。
    386     sResult.assign( lpsz, iLen-1 ); // 对string对象进行赋值。
    387     delete []lpsz;
    388     return sResult;
    389  
    390 }
    391 string Preprocess::ICTsplit(const char *sInput)
    392 {
    393     if(!ICTCLAS_Init())
    394     {
    395         printf("ICTCLAS INIT FAILED!
    ");
    396         string strerr(sInput);
    397         return strerr;
    398     }
    399     ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);
    400     //导入用户词典后
    401     
    402  
    403     const char* sResult = ICTCLAS_ParagraphProcess(sInput, 0);
    404     string strresult(sResult);
    405     //printf("%s
    ", sResult);
    406     //把字符串转化成宽字符串
    407     wstring wsResult=myMultibyteToWideChar(strresult);
    408     boost::wregex wreg(L"\s+");
    409     wsResult=boost::regex_replace(wsResult,wreg,wstring(L"|"));
    410     strresult=myWideCharToMultibyte(wsResult);
    411  
    412  
    413  
    414     //ofile<<str1;
    415     //ofile.close();
    416     //cout<<str1<<endl;
    417     //ICTCLAS_FileProcess("text.txt","test_result.txt",1);
    418     ICTCLAS_Exit();
    419  
    420     return strresult;
    421 }
    422 vectorPreprocess::goodWordsinPieceArticle(string rawtext,set stopwords)
    423 {
    424     vector goodWordstemp;
    425     vector goodWords;
    426     const char* sInput=rawtext.c_str();
    427     string sResult=ICTsplit(sInput);
    428     wstring wsResult=myMultibyteToWideChar(sResult);
    429     boost::wregex wreg(L"\d+");//去掉中文空格
    430     wsResult=boost::regex_replace(wsResult,wreg,wstring(L""));
    431     //boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);
    432     boost::split(goodWordstemp,wsResult,boost::is_any_of("|"));
    433  
    434     for(vector::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++)
    435     {
    436         string temp=myWideCharToMultibyte(*it);
    437         trim(temp," ");
    438         if(!stopwords.count(temp)&&!temp.empty())
    439         {
    440             goodWords.push_back(temp);
    441         }
    442  
    443  
    444     }
    445  
    446     return goodWords;
    447 }
    448 void Preprocess::DFcharicteristicWordSelection(mapint,int>>> &mymap,int DFthreshold)
    449 {
    450     int finalKeyWordsCount=0;//计算共取了多少个关键词
    451     vectorint> >tempvector;
    452     for(mapint,int>>>::iterator it=mymap.begin();it!=mymap.end();++it)
    453     {
    454         tempvector.push_back(make_pair(it->first,(it->second).size()));
    455     }
    456  
    457     stable_sort(tempvector.begin(),tempvector.end(),isLonger);
    458     ofstream outfile(featurewordsAddress);
    459     for(vectorint> >::iterator it=tempvector.begin();it!=tempvector.end();it++)
    460     {  
    461         if(it->second>=DFthreshold)
    462         {
    463             //outfile<<it->first<<" "<<it->second<<endl;
    464             outfile<<it->first<<endl;
    465             finalKeyWordsCount++;
    466  
    467         }
    468  
    469     }
    470     outfile.close();
    471     cout<<"最后共选择特征词"<<finalKeyWordsCount<<endl;
    472     cout<<"by the way,DFthreshold equals"<<DFthreshold<<endl;
    473  
    474 }
    475 vectorPreprocess::GetFinalKeyWords()
    476 {
    477     vectormyKeys;
    478     ifstream infile(featurewordsAddress);
    479     while(!infile.eof())
    480     {
    481         string temp;
    482         infile>>temp;
    483         if(temp!="")
    484         {
    485             myKeys.push_back(temp);
    486         }
    487  
    488  
    489     }
    490     return myKeys;
    491 }
    492 vectorint,int> >Preprocess::GetfinalKeysMaxTFDF(mapint,int>>> &mymap)
    493 {
    494     vectorint,int> >maxTFandDF;
    495     vectormyKeys=GetFinalKeyWords();
    496     for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++)
    497     { 
    498         int DF=mymap[*it].size();
    499         int maxTF=0;
    500         for(vectorint,int> >::iterator subit=mymap[*it].begin();subit!=mymap[*it].end();subit++)
    501         {
    502             if(subit->second>maxTF)
    503             {
    504                 maxTF=subit->second;
    505             }
    506  
    507         }
    508         maxTFandDF.push_back(make_pair(maxTF,DF));
    509         //find_if(mymap[*it].begin(),mymap[*it].end(),
    510     }
    511     return maxTFandDF;
    512 }
    513 vectorint,double> >Preprocess::NormalizationVSM(vectorint,double> > tempVSM)
    514 {
    515  
    516     double sum=0;
    517     for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
    518     {
    519         sum+=pow(vsmit->second,2);
    520     }
    521     for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
    522     {
    523         vsmit->second/=sqrt(sum);
    524     }
    525     return tempVSM;
    526  
    527 }
    528 string Preprocess::FormatVSMtoString(vectorint,double> > tempVSM)
    529 {
    530     string ret="{";
    531     int commaindication=0;
    532     for(vectorint,double> >::iterator vsmit=tempVSM.begin();vsmit!=tempVSM.end();++vsmit)
    533     {  
    534  
    535         ret+=do_fraction(vsmit->first)+" "+do_fraction(vsmit->second,8);
    536         if(commaindication
    537         {
    538             ret+=",";
    539         }
    540         commaindication++;
    541     }
    542     ret+="}";
    543     return ret;
    544 }
    545 void Preprocess::WriteHeadArff()
    546 {
    547     ofstream ofile(arffFileAddress,ios::binary);
    548     ofile<<"@relation aticle"<<endl;
    549     ofile<<"
    ";
    550     vector myKeys=GetFinalKeyWords();
    551     for(vector::iterator it=myKeys.begin();it!=myKeys.end();it++)
    552     {
    553         //string temp="@attribute "+"'"+(*it)+"'"+" real";
    554         string temp="";
    555         temp+="@attribute ";
    556         temp+="'";
    557         temp+=*(it);
    558         temp+="'";
    559         temp+=" real";
    560         
    561  
    562         ofile<<temp<<endl;
    563     }
    564     ofile<<"
    "<<endl;
    565     ofile<<"@data"<<endl;
    566     ofile.close();
    567 }
    568 void Preprocess::VSMFormation(mapint,int>>> &mymap)
    569 {   int corpus_N=endIndex-beginIndex+1;
    570     ofstream ofile1(articleIdsAddress,ios::binary);//保存文章编号的文件
    571     ofstream ofile2(arffFileAddress,ios::binary|ios::app);
    572  
    573     vector myKeys=GetFinalKeyWords();
    574     vectorint,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
    575     for(int i=beginIndex;i<=endIndex;i++)
    576     {   vectorint,double> >tempVSM;
    577         for(vector::size_type j=0;j
    578         {
    579         //vector >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
    580             double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
    581  
    582  
    583             TF=0.5+0.5*(double)TF/(maxTFandDF[j].first);
    584             TF*=log((double)corpus_N/maxTFandDF[j].second);
    585             if(TF!=0)
    586             {
    587                 tempVSM.push_back(make_pair(j,TF));
    588  
    589             }
    590  
    591  
    592  
    593         }
    594         if(!tempVSM.empty())
    595         {
    596             tempVSM=NormalizationVSM(tempVSM);
    597             string vsmStr=FormatVSMtoString(tempVSM);
    598             ofile1<<i<<endl;
    599             ofile2<<vsmStr<<endl;
    600         }
    601         tempVSM.clear();
    602  
    603  
    604  
    605     }
    606     ofile1.close();
    607     ofile2.close();
    608  
    609  
    610 }
    611 void Preprocess::WriteTotalArff(char *dbfield,int DFthreshold,bool isbagOfWordsExist,FUNCSEG seg)
    612 {
    613      
    614      
    615     mapint,int>>> mymap;
    616     if(!isbagOfWordsExist)
    617     {
    618         ConstructMap(mymap,dbfield,seg);
    619         save(mymap);
    620         cout<<"词袋子信息已经保存到硬盘"<<endl;
    621     }
    622     else
    623     {
    624         load(mymap);
    625     }
    626     DFcharicteristicWordSelection(mymap,DFthreshold);
    627     WriteHeadArff();
    628     VSMFormation(mymap);
    629     cout<<"arff文件已经形成"<<endl;
    630      
    631      
    632     string temp(infoFromWekaAddress);
    633  
    634     cout<<"请您将使用weka聚类,并保存为"<<temp<<endl;
    635 }
    636 map<</code>int,vector<</code>double> > Preprocess::VSMConstruction(mapint,int>>> &mymap)
    637 {  
    638     int corpus_N=endIndex-beginIndex+1;
    639     map<</code>int,vector<</code>double>> vsmMatrix;
    640     vector myKeys=GetFinalKeyWords();
    641     vectorint,int> >maxTFandDF=GetfinalKeysMaxTFDF(mymap);
    642     for(int i=beginIndex;i<=endIndex;i++)
    643     {  
    644         vectorint,double> >tempVSM;
    645         for(vector::size_type j=0;j
    646         {
    647             //vector >::iterator findit=find_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
    648             double TF=(double)count_if(mymap[myKeys[j]].begin(),mymap[myKeys[j]].end(),PredTFclass(i));
    649             TF=0.5+(double)TF/(maxTFandDF[j].first);
    650             TF*=log((double)corpus_N/maxTFandDF[j].second);
    651             tempVSM.push_back(make_pair(j,TF));
    652  
    653         }
    654         if(!tempVSM.empty())
    655         {
    656             tempVSM=NormalizationVSM(tempVSM);
    657             for(vectorint,double> >::iterator it=tempVSM.begin();it!=tempVSM.end();it++)
    658             {
    659                 vsmMatrix[i].push_back(it->second);
    660             }
    661  
    662  
    663  
    664         }
    665         tempVSM.clear();
    666  
    667  
    668  
    669     }
    670     return vsmMatrix;
    671  
    672 }
    673 map<</code>double> > Preprocess::GetClusters()
    674 {
    675  
    676     map<</code>double> >clusters;
    677     ifstream ifile(infoFromWekaAddress);
    678     string temp;
    679     while(getline(ifile,temp))
    680     {   boost::smatch matchcluster;
    681     boost::regex regcluster("Cluster\s+\d+",boost::regex::icase);
    682     if(boost::regex_search(temp,matchcluster,regcluster))  
    683     {  
    684         string clustertmp=matchcluster[0].str();
    685         string ordinates="";
    686         getline(ifile,ordinates);
    687         boost::regex regordinates("\d+(\.\d{1,4})?");
    688         boost::smatch matchordinates;
    689         std::string::const_iterator it=ordinates.begin(); 
    690         std::string::const_iterator end=ordinates.end();
    691         while (boost::regex_search(it,end,matchordinates,regordinates))
    692         {      
    693             string digitstemp=matchordinates[0].str();
    694             double digitval=0.0;
    695             std::stringstream ss;
    696             ss<<digitstemp;
    697             ss>>digitval;
    698             clusters[clustertmp].push_back(digitval);
    699             it=matchordinates[0].second;
    700         }
    701  
    702  
    703  
    704  
    705  
    706     }
    707     }
    708     return clusters;
    709 }
    710 double Preprocess::CalDotProductOfVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2)
    711 {
    712     double result = 0.0f;
    713     for (int i = 0; i < vector1.size(); i++)
    714         result += vector1[i] * vector2[i];
    715     return result;
    716 }
    717 double Preprocess::CalCosineofVectors(const vector<</code>double>&vector1,const vector<</code>double>&vector2)
    718 {
    719     double numerator=CalDotProductOfVectors(vector1,vector2);
    720     double denominator=CalDotProductOfVectors(vector1,vector1)*CalDotProductOfVectors(vector2,vector2);
    721     denominator=sqrt(denominator);
    722     return numerator/denominator;
    723 }
    724 vectorint,string> > Preprocess::GenerateClusterInfo(map<</code>int,vector<</code>double> >&vsmMatrix, map<</code>double> >&clusters)
    725 {
    726     vectorint,string> >resultInfo;
    727     for(map<</code>int,vector<</code>double> >::iterator it=vsmMatrix.begin();it!=vsmMatrix.end();it++)
    728     {
    729         vectordouble> >clusterDistanceAist;
    730         for(map<</code>double> >::iterator clusterit=clusters.begin();clusterit!=clusters.end();clusterit++)
    731         {
    732  
    733             double temp=CalCosineofVectors(it->second,clusterit->second);
    734             clusterDistanceAist.push_back(make_pair(clusterit->first,temp));
    735  
    736         }
    737         sort(clusterDistanceAist.begin(),clusterDistanceAist.end(),myCmp);
    738         vectordouble> >::iterator cDAit=clusterDistanceAist.begin();
    739  
    740         resultInfo.push_back(make_pair(it->first,cDAit->first));
    741         clusterDistanceAist.clear();
    742     }
    743     return  resultInfo;
    744  
    745 }
    746 map<</code>int> > Preprocess::FetchArticlesOFClusters(map<</code>double> >&clusters,vectorint,string>>&resultInfo)
    747 {
    748     map<</code>int>> articlesInfo;
    749  
    750     for(vectorint,string>>::iterator retit=resultInfo.begin();retit!=resultInfo.end();retit++)
    751     {
    752         for(map<</code>double> >::iterator it=clusters.begin();it!=clusters.end();it++)
    753         {
    754             if(retit->second==it->first)
    755             {
    756                 articlesInfo[it->first].push_back(retit->first);
    757             }
    758         }
    759     }
    760  
    761  
    762  
    763  
    764  
    765     return articlesInfo;
    766  
    767  
    768 }
    769 void Preprocess::RetreiveArticleInfoFromDataBase()
    770 {
    771     mapint,int>>> mymap;
    772     vectorint,string>>resultInfo;
    773     map<</code>double> >clusters;
    774     map<</code>int,vector<</code>double> >vsmMatrix;
    775     map<</code>int>> articlesInfo;
    776     ofstream ofile("F:\cluster\ArticlesInPerCluster.txt");
    777     //boost::regex_replace(strresult)
    778     //ConstructMap(mymap,1,500);
    779     //save(mymap);
    780     load(mymap);
    781     vsmMatrix=VSMConstruction(mymap);
    782     clusters=GetClusters();
    783     resultInfo=GenerateClusterInfo(vsmMatrix,clusters);
    784     articlesInfo=FetchArticlesOFClusters(clusters,resultInfo);
    785  
    786     
    787     for(map<</code>int>>::iterator it=articlesInfo.begin();it!=articlesInfo.end();it++)
    788     {
    789         ostringstream out;
    790         string selectassist;
    791         char *selectsql=new char[5000];
    792         int count=0;
    793         CoInitialize(NULL);
    794         _ConnectionPtr pConn(__uuidof(Connection));
    795         _RecordsetPtr pRst(__uuidof(Recordset));
    796         pConn->ConnectionString=dbconnection;
    797         pConn->Open("","","",adConnectUnspecified);
    798         cout <<it->first<<endl;
    799         ofile<<it->first<<endl;
    800         out<<"(";
    801         count=0;
    802         for(int i=0;isecond.size();i++)
    803         {
    804             out<<(it->second)[i];
    805             if(countsecond.size()-1)
    806             {
    807                 out<<",";
    808             }
    809             count++;
    810              
    811          
    812         }
    813         out<<")";
    814         selectassist=out.str();
    815         sprintf_s(selectsql,5000,"%s %s","Select ArticleTitle,class from News Where ArticleId in ",selectassist.c_str());
    816  
    817         pRst=pConn->Execute(selectsql,NULL,adCmdText);
    818         while(!pRst->rsEOF)
    819         {  
    820         //string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
    821             string title=(_bstr_t)pRst->GetCollect("ArticleTitle");
    822             //string rawtext=(_bstr_t)pRst->GetCollect("ArticleText");
    823             string categorization=(_bstr_t)pRst->GetCollect("class");
    824             cout<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
    825             ofile<<"文章标题:"<<title<<"文章所属类别: "<<categorization<<endl;
    826  
    827  
    828              
    829          
    830  
    831  
    832             pRst->MoveNext();
    833              
    834         }
    835         pRst->Close();
    836         pConn->Close();
    837         pRst.Release();
    838         pConn.Release();
    839         CoUninitialize();
    840      
    841     }
    842      
    843      
    844  
    845  
    846 ofile.close(); 
    847      
    848      
    849 }
    850 vectorPreprocess:: mySplit(string s,set stopwords)
    851 {
    852     vector wordCollection;
    853     trim(s," ");
    854  
    855     int nPosBegin=0;
    856     int nPosEnd=s.find(' ',nPosBegin);
    857     while(nPosEnd!=string::npos)
    858     {
    859         string temp=s.substr(nPosBegin,nPosEnd-nPosBegin);
    860         trim(temp," ");
    861         wordCollection.push_back(temp);
    862         nPosBegin=s.find_first_not_of(' ',nPosEnd);
    863         nPosEnd=s.find(' ',nPosBegin);
    864     }
    865     string temp=s.substr(nPosBegin,s.size()-nPosBegin);
    866     trim(temp," ");
    867     wordCollection.push_back(temp);
    868  
    869  
    870     return wordCollection;
    871  
    872 }
    873  
  • 相关阅读:
    BestCoder 1st Anniversary ($) 1002.Hidden String
    Oracle 复制随意表一行的SQL语句(測试Ok)
    华为招聘机试整理15:约瑟夫环
    302跳转
    CCNP路由实验之十 组播(多播)
    How to test Heat (by quqi99)
    用关联容器实现文本替换单词
    【C语言】04-函数
    WAS集群系列(5):集群搭建:步骤3:安装IHS软件
    [Apple开发者帐户帮助]七、注册设备(2)注册多个设备
  • 原文地址:https://www.cnblogs.com/wq920/p/3275882.html
Copyright © 2011-2022 走看看