zoukankan      html  css  js  c++  java
  • 我的第一个C++程序,还像个C++程序的样子吧

    目的:从数据库中抽取文章关键词,并统计这些关键词在哪些文章中出现、出现多少次(算是词袋子模型吧);然后对每篇文章形成VSM模型,写成weka的数据格式,再调用weka对文章聚类。

    目前“形成词袋子模型”这一块已经完毕。

    其中词袋子的数据结构如下:

    map<string, vector<pair<int,int> > > mymap(键为关键词,值为若干“文章编号, 出现次数”对)

    目前已经完成此部分的 serialize(save/load)以及 print 功能。

    #include "stdafx.h"
    #include<iostream>
    #include<map>
    #include<vector>
    #include<string>
    #include<iomanip>
    #include<fstream>
    //#include<boost/tokenizer.hpp>
    using namespace std;

    形成词袋子模型
    nt ConstructMap(map<string,vector<pair<int,int>>>&mymap)
    {
        
        vector
    <string> mySplit(string s);
        CoInitialize(NULL);
        _ConnectionPtr pConn(__uuidof(Connection));
        _RecordsetPtr pRst(__uuidof(Recordset));
        pConn
    ->ConnectionString="Provider=SQLOLEDB.1;Password=xxx;Persist Security Info=True; User ID=sa;Initial Catalog=ArticleCollection";
        pConn
    ->Open("","","",adConnectUnspecified);
        pRst
    =pConn->Execute("select CKeyWord,ArticleId from Article order by ArticleId",NULL,adCmdText);
        
    while(!pRst->rsEOF)
        {    vector
    <string>wordcollection;
            
    string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
            
    if(keywordstr!="")
            {
                    wordcollection
    =mySplit(keywordstr);
                    
    string tempid=(_bstr_t)pRst->GetCollect("ArticleId");
                    
    int articleid=atoi(tempid.c_str());
                    
    for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)
                    {
                        vector
    <pair<int,int>>::iterator it;
                        
    if(mymap[*strit].empty())
                        {
                            pair
    <int,int>mytemppair=make_pair(articleid,1);
                            mymap[
    *strit].push_back(mytemppair);

                        }
                        
    else
                        {
                            
    for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)
                            {  
                                
    if(it->first==articleid)
                                {
                                    it
    ->second=++(it->second);
                                    
    break;
                                }
                        
                            }
                            
    if(it==mymap[*strit].end())
                            {
                                pair
    <int,int>mytemppair=make_pair(articleid,1);
                                mymap[
    *strit].push_back(mytemppair);
                            }

                        }

                }
                

            }
            
            
            pRst
    ->MoveNext();
            wordcollection.clear();
        }
        pRst
    ->Close();
        pConn
    ->Close();
        pRst.Release();
        pConn.Release();
        CoUninitialize();
        
    return 0;

    }
    加载词袋子模型
    /**
     * Loads a bag-of-words index from the text format written by save().
     *
     * Expected layout: the number of entries, then for each entry the key,
     * the number of pairs, and that many "articleId count ;" groups, all
     * whitespace separated.
     *
     * @param mymap  Receives the entries. A key already present is
     *               overwritten; other existing keys are left untouched.
     * @param path   File to read; defaults to the original fixed location.
     *
     * Reading stops at the first extraction that fails. A missing or
     * unreadable file therefore leaves mymap unchanged, and a truncated last
     * record is discarded instead of being stored half-read. The loop is
     * driven by extraction success rather than eof(): eof() is never set when
     * the file fails to open, and is not yet set after the last record while
     * a trailing newline remains.
     *
     * The leading entry count is read only to skip past it; the number of
     * records actually present in the file decides how many are loaded.
     * Keys are read as whitespace-delimited tokens, so a key containing
     * whitespace cannot round-trip through this format.
     */
    void load(std::map<std::string, std::vector<std::pair<int,int> > >& mymap,
              const char* path = "c:\\mydict.dat")
    {
        std::ifstream infile(path, std::ios::binary);

        int declaredEntries = 0;  // dictionary size recorded in the header
        if (!(infile >> declaredEntries))
        {
            return;  // file missing, unreadable or empty
        }

        std::string key;    // word being read
        int pairCount = 0;  // number of articles the word occurs in
        while (infile >> key >> pairCount)
        {
            std::vector<std::pair<int,int> > postings;
            bool complete = true;

            for (int i = 0; i < pairCount; ++i)
            {
                int articleId = 0;      // article number
                int occurrences = 0;    // occurrences within that article
                std::string separator;  // the ";" that follows each pair
                if (!(infile >> articleId >> occurrences >> separator))
                {
                    complete = false;
                    break;
                }
                postings.push_back(std::make_pair(articleId, occurrences));
            }

            if (!complete)
            {
                break;  // truncated record: do not store partial data
            }
            mymap[key] = postings;
        }
    }
    保存词袋子模型
    /**
     * Writes the bag-of-words index to a text file.
     *
     * Layout: the number of entries on the first line; then for each entry
     * (in map order) the key on one line, the number of pairs on the next,
     * and a line holding "articleId count ; " for every pair.
     *
     * @param mymap  Index to write: word -> list of (article id, count) pairs.
     * @param path   File to create or overwrite; defaults to the original
     *               fixed location.
     *
     * Nothing is reported if the file cannot be opened; the writes simply
     * have no effect in that case.
     */
    void save(const std::map<std::string, std::vector<std::pair<int,int> > >& mymap,
              const char* path = "c:\\mydict.dat")
    {
        typedef std::vector<std::pair<int,int> > PostingList;
        typedef std::map<std::string, PostingList> WordIndex;

        std::ofstream outfile(path, std::ios::binary);

        outfile << mymap.size() << '\n';
        for (WordIndex::const_iterator entry = mymap.begin(); entry != mymap.end(); ++entry)
        {
            const PostingList& postings = entry->second;

            outfile << entry->first << '\n';
            outfile << postings.size() << '\n';
            for (PostingList::const_iterator pr = postings.begin(); pr != postings.end(); ++pr)
            {
                // The " ; " token is kept as a separate word so a reader can
                // skip it with a single string extraction.
                outfile << pr->first << " " << pr->second << " " << ";" << " ";
            }
            outfile << '\n';
        }
        // The stream is flushed and closed when outfile goes out of scope.
    }
    打印词袋子模型
    /**
     * Dumps the bag-of-words index to standard output.
     *
     * Prints the number of entries, then for each entry (in map order) the
     * key, the number of pairs, and one line of "articleId,count;" groups.
     *
     * @param mymap  Index to print: word -> list of (article id, count) pairs.
     */
    void print(std::map<std::string, std::vector<std::pair<int,int> > >& mymap)
    {
        typedef std::vector<std::pair<int,int> > PostingList;
        typedef std::map<std::string, PostingList> WordIndex;

        std::cout << mymap.size() << std::endl;

        WordIndex::iterator entry = mymap.begin();
        while (entry != mymap.end())
        {
            PostingList& postings = entry->second;

            std::cout << entry->first << std::endl;
            std::cout << postings.size() << std::endl;

            for (PostingList::size_type i = 0; i < postings.size(); ++i)
            {
                std::cout << postings[i].first << ',' << postings[i].second << ";";
            }
            std::cout << std::endl;

            ++entry;
        }
    }
  • 相关阅读:
    badblocks 检查硬盘是否有坏道
    IE兼容性开发的笔记
    Linux下设置ip和主机名进行绑定
    netty httpserver
    netty websocket协议开发
    OAuth2.0和SSO授权的区别
    window.location.href跳转问题2
    修改密码,验证两次输入是否相同,相同才能提交
    (2)集合 遍历set集合
    (1)集合 ---遍历map集合
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1808300.html
Copyright © 2011-2022 走看看