zoukankan      html  css  js  c++  java
  • 借助weka实现的分类器进行针对文本分类问题的特征词选择实验(实验代码备份)

    主函数头文件

    View Code
    1 #include "stdafx.h"
    2 #include"Preprocess.h"
    3 #include"common.h"
    4 #include "CorpusProcess.h"
    5 #include "LibSvmClassifier.h"

    第一部分:

    建立词典和关联表

     // Part one: build the dictionary and the contingency table from the training corpus
     // table, then save each to its .dat path and load it back.
     // This is a fragment of a function body; the enclosing function header is not shown here.
     //
     // seg is a pointer to the Preprocess member function goodWordsinPieceArticle; it is
     // handed to ConstructDictionary below.
     1 Preprocess::FUNCSEG seg=&Preprocess::goodWordsinPieceArticle;
     // beginIndex/endIndex are passed to the Preprocess constructor.
     // presumably the range of article ids to process -- TODO confirm against Preprocess.
     2     int beginIndex=1;
     3     int endIndex=6950;
     4     Preprocess p(beginIndex,endIndex);
     // Containers filled by the Preprocess calls in this fragment and the next one.
     5     DICTIONARY mymap;
     6     CONTINGENCY contigencyTable;
     7     FeatureWeight mymapweight;
     8     DOCMATRIX_1 trainingSet;
     9     DOCMATRIX_1 testingSet;
    10     vector<string>labels;
     // Corpus table names passed to the Preprocess calls (spelling kept as-is: runtime strings).
    11     string testCorpusTable="ReteursTestingCorpus";
    12     string trainCorpusTable="ReteursTrainingCorpus";
     // NOTE(review): initialising a non-const char* from a string literal is ill-formed since
     // C++11 (some compilers accept it as an extension); const char* would be the portable
     // form if the Preprocess signatures allow it -- verify before changing.
    13     char*dictaddress="D:\\ReteursForWeka\\dict.dat";
    14     char*contigencyaddress="D:\\ReteursForWeka\\contigency.dat";
     // NOTE(review): labels are read from the *testing* table while the dictionary and the
     // contingency table below are built from the *training* table -- confirm this is intended.
    15     labels=p.GetLabels(testCorpusTable);
     // Build the dictionary from the training table using seg, save it, then load it back
     // from the same path.
     // presumably the immediate reload is a round-trip check, or lets earlier steps be
     // commented out on later runs -- TODO confirm.
    16     p.ConstructDictionary(mymap,seg,trainCorpusTable);
    17     cout<<"finish construct dictionary"<<endl;
    18     p.SaveDictionary(mymap,dictaddress);
    19     cout<<"finish save dictionary"<<endl;
    20     p.LoadDictionary(mymap,dictaddress);
    21     cout<<"finish load dictionary"<<endl;
     // Same build / save / load sequence for the contingency table, which takes the
     // dictionary and the label list as inputs.
    22     p.GetContingencyTable(mymap,labels,contigencyTable,trainCorpusTable);
    23     cout<<"finish construct contigencytable"<<endl;
    24     p.SaveContingencyTable(contigencyTable,contigencyaddress);
    25     cout<<"finish save contigencytable"<<endl;
    26     p.LoadContingencyTable(contigencyTable,contigencyaddress);
    27     cout<<"finish loadcontigencytable"<<endl;

    第二部分:

    遴选特征词,形成VSM模型,形成arff数据格式

     // Part two: compute feature weights once, then for each of ten feature dimensions
     // select keywords and write a test and a training ARFF file (header + data body).
     // Fragment of a function body; it uses p, mymap, contigencyTable, mymapweight, labels,
     // trainingSet, testingSet, dictaddress, contigencyaddress and the corpus table names
     // declared in the previous fragment.
     //
     // NOTE(review): the output directory is named "chi" but the weighting call below is
     // InformationGainFeatureSelection -- confirm the directory matches the method used.
     // NOTE(review): string literal bound to non-const char* (ill-formed since C++11).
     1 char* dest="D:\\ReteursForWeka\\chi\\";
     // The ten feature counts iterated by the loop below.
     2     int featuredimension[10]={50,100,200,300,400,500,1000,3000,5000,8000};
     3     char *weightaddress="D:\\ReteursForWeka\\chi\\wordsweight.dat";
     // 1000-byte path buffers, rebuilt on every loop iteration and released with delete[]
     // after the loop.
     4     char *keywordaddress=new char[1000];
     5     char *trainvsmaddress=new char[1000]; 
     6     char *testvsmaddress=new char[1000];
     // Load dictionary and contingency table from the .dat paths again.
     // presumably so this part can run without re-running part one -- TODO confirm.
     7     p.LoadDictionary(mymap,dictaddress);
     8     p.LoadContingencyTable(contigencyTable,contigencyaddress);
     // Runs once, before the loop; receives mymapweight and weightaddress, which are also
     // passed to FeatureSelectionFactory inside the loop.
     9     p.InformationGainFeatureSelection(labels,mymap,mymapweight,contigencyTable,weightaddress);
    10 
    11 
    12     for (int i=0;i<10;i++)
    13     {
    14 
     // Zero the three path buffers before formatting this iteration's file names.
    15         memset(keywordaddress,0,1000);
    16         memset(trainvsmaddress,0,1000);
    17         memset(testvsmaddress,0,1000);
    18 
     // File names are dest + do_fraction(featuredimension[i]) + a fixed suffix;
     // sprintf_s is bounded to the 1000-byte buffer size.
    19         sprintf_s(keywordaddress,1000,"%s%skeywords.dat",dest,p.do_fraction(featuredimension[i]).c_str());
    20         sprintf_s(trainvsmaddress,1000,"%s%strainCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
    21         sprintf_s(testvsmaddress,1000,"%s%stestCorpus.arff",dest,p.do_fraction(featuredimension[i]).c_str());
     // Keyword selection for this dimension; keywordaddress is reused by every call below.
     // NOTE(review): meaning of the literal `true` argument is not visible here -- check
     // the FeatureSelectionFactory declaration.
    22         p.FeatureSelectionFactory(labels,mymapweight,weightaddress,keywordaddress,featuredimension[i],true,trainCorpusTable);
    23 
    24         cout<<keywordaddress<<"finish"<<endl;
     // Test set: write ARFF header, build the VSM into testingSet, write the data body,
     // then clear testingSet for the next iteration.
     // NOTE(review): 1 and 2676 are hard-coded; presumably the test document id range --
     // verify against the test corpus table.
    25         p.WriteHeadArff(testvsmaddress,keywordaddress,labels);
    26         p.GetManyVSM(1,2676,testCorpusTable,mymap,testingSet,keywordaddress);
    27         p.WriteDataBodyArff(testingSet,testCorpusTable,testvsmaddress,featuredimension[i]);
    28         testingSet.clear();
    29         cout<<testvsmaddress<<"finish"<<endl;
     // Training set: same header / VSM / body / clear sequence, using VSMConstruction
     // to fill trainingSet.
    30         p.WriteHeadArff(trainvsmaddress,keywordaddress,labels);
    31         p.VSMConstruction(mymap,trainingSet,keywordaddress);
    32         p.WriteDataBodyArff(trainingSet,trainCorpusTable,trainvsmaddress,featuredimension[i]);
    33         trainingSet.clear();
    34         cout<<trainvsmaddress<<"finish"<<endl;
    35 
    36     }
    37 
     // Release the path buffers allocated with new[] above.
    38     delete []keywordaddress;
    39     delete []trainvsmaddress;
    40     delete []testvsmaddress;
    41     
    42     cout<<"finish"<<endl;
     // Waits for an integer on stdin before returning.
     // presumably to keep the console window open -- TODO confirm.
    43     int end;
    44     cin>>end;
    45     return 0;
  • 相关阅读:
    .net从网络接口地址获取json,然后解析成对象(一)
    .net获取本地ip地址
    .net上传文件,利用npoi读取文件信息到datatable里
    .net利用NPOI生成excel文件
    .NET获取城市信息(将三字代码转换成城市名)
    JS下拉页面时一个横幅的样式和js
    整数中1出现的次数(1~n)
    连续子数组的最大和
    最小的K个数
    数组中出现次数超过一半的数字
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1996252.html
Copyright © 2011-2022 走看看