zoukankan      html  css  js  c++  java
  • 借助weka实现的分类器进行针对文本分类问题的特征词选择实验(实验代码备份)

    主函数头文件

    View Code
    1 #include "stdafx.h"
    2 #include"Preprocess.h"
    3 #include"common.h"
    4 #include "CorpusProcess.h"
    5 #include "LibSvmClassifier.h"

    第一部分:

    建立词典和关联表

     // Part 1: build the dictionary and the contingency table from the training
     // corpus table, writing each to disk and reading it back.
     // seg is a pointer to the member function Preprocess::goodWordsinPieceArticle;
     // it is handed to ConstructDictionary below.
     1 Preprocess::FUNCSEG seg=&Preprocess::goodWordsinPieceArticle;
     // beginIndex/endIndex are only forwarded to the Preprocess constructor here.
     // NOTE(review): presumably the article-id range of the corpus -- confirm.
     2     int beginIndex=1;
     3     int endIndex=6950;
     4     Preprocess p(beginIndex,endIndex);
     // Containers filled by the Preprocess calls in this part and the next one.
     5     DICTIONARY mymap;
     6     CONTINGENCY contigencyTable;
     7     FeatureWeight mymapweight;
     8     DOCMATRIX_1 trainingSet;
     9     DOCMATRIX_1 testingSet;
    10     vector<string>labels;
     // Names passed to Preprocess as corpus-table identifiers.
    11     string testCorpusTable="ReteursTestingCorpus";
    12     string trainCorpusTable="ReteursTrainingCorpus";
     // Files used to persist the dictionary and the contingency table.
    13     char*dictaddress="D:\\ReteursForWeka\\dict.dat";
    14     char*contigencyaddress="D:\\ReteursForWeka\\contigency.dat";
     // Labels are read via the *testing* corpus table name.
    15     labels=p.GetLabels(testCorpusTable);
     // Dictionary: construct from the training corpus table, save, then load back
     // from the file that was just written.
    16     p.ConstructDictionary(mymap,seg,trainCorpusTable);
    17     cout<<"finish construct dictionary"<<endl;
    18     p.SaveDictionary(mymap,dictaddress);
    19     cout<<"finish save dictionary"<<endl;
    20     p.LoadDictionary(mymap,dictaddress);
    21     cout<<"finish load dictionary"<<endl;
     // Contingency table: computed from the dictionary, the labels and the training
     // corpus table, then saved and loaded back in the same way.
    22     p.GetContingencyTable(mymap,labels,contigencyTable,trainCorpusTable);
    23     cout<<"finish construct contigencytable"<<endl;
    24     p.SaveContingencyTable(contigencyTable,contigencyaddress);
    25     cout<<"finish save contigencytable"<<endl;
    26     p.LoadContingencyTable(contigencyTable,contigencyaddress);
    27     cout<<"finish loadcontigencytable"<<endl;

    第二部分:

    遴选特征词,形成VSM模型,形成arff数据格式

     // Part 2: for each feature dimension, select feature words and write test and
     // training ARFF files under dest.
     // dest is the path prefix of every file produced in the loop below.
     1 char* dest="D:\\ReteursForWeka\\chi\\";
     // The ten feature-set sizes the loop iterates over.
     2     int featuredimension[10]={50,100,200,300,400,500,1000,3000,5000,8000};
     3     char *weightaddress="D:\\ReteursForWeka\\chi\\wordsweight.dat";
     // Three 1000-byte path buffers; released with delete[] after the loop.
     4     char *keywordaddress=new char[1000];
     5     char *trainvsmaddress=new char[1000]; 
     6     char *testvsmaddress=new char[1000];
     // Reload the dictionary and contingency table from the files written in part 1.
     7     p.LoadDictionary(mymap,dictaddress);
     8     p.LoadContingencyTable(contigencyTable,contigencyaddress);
     // NOTE(review): the output directory is named "chi" but the routine called is
     // InformationGainFeatureSelection -- confirm this is the intended selector.
     9     p.InformationGainFeatureSelection(labels,mymap,mymapweight,contigencyTable,weightaddress);
    10 
    11 
    12     for (int i=0;i<10;i++)
    13     {
    14 
     // Zero the path buffers before formatting this iteration's file names.
    15         memset(keywordaddress,0,1000);
    16         memset(trainvsmaddress,0,1000);
    17         memset(testvsmaddress,0,1000);
    18 
     // Each path is dest + p.do_fraction(featuredimension[i]) + a fixed suffix.
    19         sprintf_s(keywordaddress,1000,"%s%skeywords.dat",dest,p.do_fraction(featuredimension[i]).c_str());
    20         sprintf_s(trainvsmaddress,1000,"%s%strainCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
    21         sprintf_s(testvsmaddress,1000,"%s%stestCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
     // Feature selection for the current dimension, given the weight file and the
     // keyword file path.
    22         p.FeatureSelectionFactory(labels,mymapweight,weightaddress,keywordaddress,featuredimension[i],true,trainCorpusTable);
    23 
    24         cout<<keywordaddress<<"finish"<<endl;
     // Test set: write the ARFF header, fill testingSet via GetManyVSM, write the
     // data body, then clear the set for the next iteration.
     // NOTE(review): 1 and 2676 are presumably the id range of the test corpus -- confirm.
    25         p.WriteHeadArff(testvsmaddress,keywordaddress,labels);
    26         p.GetManyVSM(1,2676,testCorpusTable,mymap,testingSet,keywordaddress);
    27         p.WriteDataBodyArff(testingSet,testCorpusTable,testvsmaddress,featuredimension[i]);
    28         testingSet.clear();
    29         cout<<testvsmaddress<<"finish"<<endl;
     // Training set: same sequence, but trainingSet is filled via VSMConstruction.
    30         p.WriteHeadArff(trainvsmaddress,keywordaddress,labels);
    31         p.VSMConstruction(mymap,trainingSet,keywordaddress);
    32         p.WriteDataBodyArff(trainingSet,trainCorpusTable,trainvsmaddress,featuredimension[i]);
    33         trainingSet.clear();
    34         cout<<trainvsmaddress<<"finish"<<endl;
    35 
    36     }
    37 
     // Release the path buffers allocated with new[] above.
    38     delete []keywordaddress;
    39     delete []trainvsmaddress;
    40     delete []testvsmaddress;
    41     
    42     cout<<"finish"<<endl;
     // Reads an int from stdin before returning, so the program waits for input here.
    43     int end;
    44     cin>>end;
    45     return 0;
  • 相关阅读:
    eclipse下c/cpp " undefined reference to " or "launch failed binary not found"问题
    blockdev 设置文件预读大小
    宝宝语录
    CentOS修改主机名(hostname)
    subprocess报No such file or directory
    用ldap方式访问AD域的的错误解释
    英特尔的VTd技术是什么?
    This virtual machine requires the VMware keyboard support driver which is not installed
    Linux内核的文件预读详细详解
    UNP总结 Chapter 26~29 线程、IP选项、原始套接字、数据链路访问
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1996252.html
Copyright © 2011-2022 走看看