zoukankan      html  css  js  c++  java
  • C++处理reuters21578(二)

    作者finallyliuyu 出处博客园

     通过《C++处理reuters21578(一)》中的代码,已初步生成两张数据表,分别存放训练语料库和测试语料库。由于这两个语料库的个别类别不一致,需要先求出两者类别的交集,再据此形成最终用于文本分类的训练语料库和测试语料库。以下代码的主函数完成此功能。

    /// Unary predicate that reports whether a string equals a stored string.
    ///
    /// Meant to be handed to standard algorithms such as std::count_if or
    /// std::find_if when searching a range of strings for one exact value.
    class GT_clss
    {
    public:
        /// Stores a copy of @p s as the value every element is compared with.
        /// The parameter is a const reference so that const strings and
        /// temporaries can be used as the comparison value.
        GT_clss(const std::string &s) : comparepart(s) {}

        /// @param elem Candidate string taken from the searched range.
        /// @return true when @p elem is equal to the stored string.
        bool operator()(const std::string &elem) const
        {
            return elem == comparepart;
        }

    private:
        std::string comparepart;  // private copy of the comparison value
    };
    统计数据库表中共有多少个类别(返回去重后的类别标签)
    vector<string>GetLabels(string tablename)
     {   vector
    <string>labels;
     
    char * selectbySpecificId=new char [1000];
     memset(selectbySpecificId,
    0,1000);
     sprintf_s(selectbySpecificId,
    1000,"select Categorization from %s ",tablename.c_str());
     CoInitialize(NULL);
     _ConnectionPtr pConn(__uuidof(Connection));
     _RecordsetPtr pRst(__uuidof(Recordset));
     pConn
    ->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
     pConn
    ->Open("","","",adConnectUnspecified);
     pRst
    =pConn->Execute(selectbySpecificId,NULL,adCmdText);
     
    while(!pRst->rsEOF)
     {
         
    string label=(_bstr_t)pRst->GetCollect("Categorization");
         
    if (!count_if(labels.begin(),labels.end(),GT_clss(label)))
         {
             labels.push_back(label);
         }

         pRst
    ->MoveNext();

     }
     pRst
    ->Close();
     pConn
    ->Close();
     pRst.Release();
     pConn.Release();
     CoUninitialize();
     delete []selectbySpecificId;

     
    return labels;



     }
    主函数
    int _tmain(int argc, _TCHAR* argv[])
    {
         
    int end;
        
    //set<string>labels;
        vector<string>labelsTrain=GetLabels("ReteursTrain");
        vector
    <string>labelsTest=GetLabels("ReteursTest");
        vector
    <string>finalLabels;
        
    for (vector<string>::iterator it=labelsTrain.begin();it!=labelsTrain.end();it++)
        {
            trim(
    *it," ");
        }
        
    for(vector<string>::iterator it=labelsTest.begin();it!=labelsTest.end();it++)
        {
            trim(
    *it," ");

        }
        
        
    for (vector<string>::iterator it=labelsTrain.begin();it!=labelsTrain.end();it++)
        {
            
    if (count_if(labelsTest.begin(),labelsTest.end(),GT_clss(*it)))
            {
                finalLabels.push_back(
    *it);
            }
        }

        
    char * selectbySpecificId=new char [1000];
        memset(selectbySpecificId,
    0,1000);
        sprintf_s(selectbySpecificId,
    1000,"select CArticleName,CAbstract,Categorization from ReteursTest");
        CoInitialize(NULL);
        _ConnectionPtr pConn(__uuidof(Connection));
        _RecordsetPtr pRst(__uuidof(Recordset));
        _ConnectionPtr pConn2(__uuidof(Connection));
        pConn
    ->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo";
        pConn2
    ->ConnectionString="Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=FinallyCorpus";
        pConn
    ->Open("","","",adConnectUnspecified);
        pConn2
    ->Open("","","",adConnectUnspecified);
        pRst
    =pConn->Execute(selectbySpecificId,NULL,adCmdText);
        
    while(!pRst->rsEOF)
        {
            
    string label=(_bstr_t)pRst->GetCollect("Categorization");
            trim(label,
    " ");

            
    if (count_if(finalLabels.begin(),finalLabels.end(),GT_clss(label)))
            {
                
    string ArticleTitle=(_bstr_t)pRst->GetCollect("CArticleName");
                
    string ArticleText=(_bstr_t)pRst->GetCollect("CAbstract");
                ArticleTitle
    =ProcessforMSSQL(ArticleTitle);
                ArticleText
    =ProcessforMSSQL(ArticleText);
                
    char *sqlInsert=new char[1000000];
                _variant_t RecordsAffected;
                memset(sqlInsert,
    0,1000000);
                sprintf_s(sqlInsert,
    1000000,"insert into ReteursTestingCorpus(CArticleName,CAbstract,Categorization) values('%s','%s','%s')",ArticleTitle.c_str(),ArticleText.c_str(),label.c_str());
                pConn2
    ->Execute(sqlInsert,&RecordsAffected,-1);
                delete []sqlInsert;

                


                
            }
            

            pRst
    ->MoveNext();

        }
        pRst
    ->Close();
        pConn
    ->Close();
        pRst.Release();
        pConn.Release();
        pConn2
    ->Close();
        pConn2.Release();
        CoUninitialize();
        delete []selectbySpecificId;

        
        cout
    <<"两标签集交集为"<<endl;

        cout
    <<finalLabels.size()<<endl;

        
    //DictionaryToDataBase();
        
        
    //FindFile(L"E:\\新闻语料\\reuters21578");
        

        
    //pRst=pConn->Execute(,NULL,adCmdText);


        
       cout
    <<"finish"<<endl;
        
        
        cin
    >>end;






    }
  • 相关阅读:
    redis集群redis-cloud搭建
    Linux下搭建redis服务器
    mybatis主键返回
    分布式文件系统FastDFS
    mapper映射文件不发布
    dubbo发布和引用服务
    PageHelper分页插件
    F. Cards and Joy
    E. Paint the Tree 树形dp
    D. Sequence Sorting dp
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1918042.html
Copyright © 2011-2022 走看看