实验框架图见《libsvm文本分类:二分类(二)》中的实验框架图。下面是主模块代码,暂不公布全部代码。
代码:
# -*- coding: cp936 -*-
#coding gb2312
from SVM import FoldersCreation
import os
# ---------------------------------------------------------------------------
# Experiment parameters
# ---------------------------------------------------------------------------
# N: half of the total corpus size.
N = 100
# vfold: number of cross-validation rounds.
vfold = 5
# featureDimension: feature dimension of the VSM (vector space model).
featureDimension = 2000
# toCalInfoGain: flag for computing the information gain of the words in the
# bag-of-words model; per the original note, a value of 1 means "do not
# compute".
toCalInfoGain = 0
# count_done_research_times: number of experiment runs already carried out.
count_done_research_times = 0
# N and count_done_research_times are the arguments of
# CorpusPartition.moveAccordingPartition; featureDimension, toCalInfoGain and
# 2*N/vfold are the arguments of FeatureSelectionModel.featureSelectionIG.
# ---------------------------------------------------------------------------
# Create the working folders
# ---------------------------------------------------------------------------
# os.mkdir raises OSError when the directory already exists, which aborted
# every run after the first one; only create the root when it is missing.
if not os.path.isdir(r'D:\TextCategorization'):
    os.mkdir(r'D:\TextCategorization')
# NOTE(review): CreateAssist lives in SVM.FoldersCreation (not visible here);
# presumably it creates the sub-folders used further down -- confirm that it
# also tolerates folders that already exist.
FoldersCreation.CreateAssist()
print('创建文件夹模块运行结束')
print('***************************************************************************')
# ---------------------------------------------------------------------------
# Corpus handling: split the document collection into a training set and a
# test set
# ---------------------------------------------------------------------------
from SVM import CorpusPartition

# Both calls take the experiment parameters defined at the top of the script.
CorpusPartition.MoveCorpus(N)
CorpusPartition.moveAccordingPartition(N, count_done_research_times)

print('分割文本集模块运行结束')
print('*******************************************************************')
#########################文档集合分词##########################################################
from SVM import DataManager
from ctypes import *
import os
import cPickle as p
import re
# Word segmentation of the document collection.
# roots[i] is read and the segmented text is written to rootfinals[i], under
# the same relative path (index 0: training folders, index 1: testing folders).
roots = [r'D:\TextCategorization\training', r'D:\TextCategorization\testing']
rootfinals = [r'D:\TextCategorization\segmented', r'D:\TextCategorization\tsegmented']

# Load and initialise the ICTCLAS segmenter once for the whole run; the
# original reloaded the DLL and called ICTCLAS_Init again for every document.
dll = cdll.LoadLibrary("ICTCLAS30.dll")
dll.ICTCLAS_Init(c_char_p("."))
# Declare the return type so ctypes returns the C string directly. Without it
# ctypes assumes an int result, and rebuilding the pointer from that int can
# truncate it on platforms where pointers are wider than int.
dll.ICTCLAS_ParagraphProcess.restype = c_char_p

# Patterns are compiled once and reused for every document.
whitespace_run = re.compile(r"\s+")
# NOTE(review): '\xa1\xa1' is presumably the GBK full-width space -- confirm.
# It is matched on raw bytes, exactly as the original did.
double_a1 = re.compile('\xa1\xa1')

try:
    for i in range(0, 2):
        dm = DataManager.DataManager(roots[i])
        subdir = dm.GetSubDir()
        # Paths relative to roots[i]: <sub-folder> + os.sep + <file path>.
        filepathstotalsrc = []
        for sub in subdir:
            dm.SetFilePathsFromsubDir(roots[i] + os.sep + sub)
            filepathstotalsrc.extend(
                sub + os.sep + path for path in dm.GetFilePaths())
        for path in filepathstotalsrc:
            myfile = open(roots[i] + os.sep + path)
            try:
                s = myfile.read()
            finally:
                myfile.close()
            # The second argument (0) is passed exactly as in the original;
            # presumably it switches POS tagging off -- TODO confirm against
            # the ICTCLAS API.
            segmented = dll.ICTCLAS_ParagraphProcess(c_char_p(s), 0)
            # Collapse every whitespace run into a single '|' separator, then
            # drop the '\xa1\xa1' byte pairs.
            segmentedtmp = whitespace_run.sub('|', segmented)
            segmentedfinal = double_a1.sub('', segmentedtmp)
            fid = open(rootfinals[i] + os.sep + path, 'w')
            try:
                fid.write(segmentedfinal)
            finally:
                fid.close()
finally:
    # Release the segmenter even when a document fails part-way through.
    dll.ICTCLAS_Exit()
print('文档集分词模块运行结束')
print('**********************************************************************')
# ---------------------------------------------------------------------------
# Build the bag-of-words model
# ---------------------------------------------------------------------------
from SVM import BagOfWordsConstruction

# The model is built from the folder holding the segmented documents.
segmented_root = r'D:\TextCategorization\segmented'
BagOfWordsConstruction.BagOfWordsConstruction(segmented_root)

print('建立词袋子模型模块运行结束')
print('***********************************************************************************')
# ---------------------------------------------------------------------------
# Feature word selection
# ---------------------------------------------------------------------------
from SVM import FeatureSelectionModel
import cPickle as mypickle

# 2 * N // vfold is explicit floor division: the same value as the original
# 2*N/vfold for these int parameters, and it stays an int even where "/"
# means true division.
featurewords = FeatureSelectionModel.featureSelectionIG(
    featureDimension, toCalInfoGain, 2 * N // vfold)

# Persist the selected feature words. try/finally closes the handle even if
# pickling fails; the original leaked it in that case.
fid = open(r'D:\TextCategorization\VITData\keywords.dat', 'w')
try:
    mypickle.dump(featurewords, fid)
finally:
    fid.close()
print('特征词选择模块运行结束')
print('*******************************************************************************************')
# ---------------------------------------------------------------------------
# Build the document vector model (LibSVM-format output files)
# ---------------------------------------------------------------------------
from SVM import VSMformation

root1 = r'D:\TextCategorization\segmented'
root2 = r'D:\TextCategorization\tsegmented'
print('begin.....')
# Each entry: (output file, segmented corpus folder, message printed once the
# conversion has finished). Training corpus first, then the test corpus.
for _target, _corpus_root, _done_message in (
        (r'D:\TextCategorization\data\train.libsvm', root1, '训练语料库转化完毕'),
        (r'D:\TextCategorization\data\test.libsvm', root2, '测试语料库转化完毕')):
    VSMformation.LibSVMFormat(_target, _corpus_root)
    print(_done_message)
print('文档向量模型建立模块运行结束')
print('批处理完毕,congratulations!')
#coding gb2312
from SVM import FoldersCreation
import os
# ---------------------------------------------------------------------------
# Experiment parameters
# NOTE(review): from here on the file repeats, line for line, the script that
# precedes it -- this looks like a paste duplication; confirm and remove one
# copy.
# ---------------------------------------------------------------------------
# N: half of the total corpus size.
N = 100
# vfold: number of cross-validation rounds.
vfold = 5
# featureDimension: feature dimension of the VSM (vector space model).
featureDimension = 2000
# toCalInfoGain: flag for computing the information gain of the words in the
# bag-of-words model; per the original note, a value of 1 means "do not
# compute".
toCalInfoGain = 0
# count_done_research_times: number of experiment runs already carried out.
count_done_research_times = 0
# N and count_done_research_times are the arguments of
# CorpusPartition.moveAccordingPartition; featureDimension, toCalInfoGain and
# 2*N/vfold are the arguments of FeatureSelectionModel.featureSelectionIG.
# ---------------------------------------------------------------------------
# Create the working folders
# ---------------------------------------------------------------------------
# os.mkdir raises OSError when the directory already exists, which aborted
# every run after the first one; only create the root when it is missing.
if not os.path.isdir(r'D:\TextCategorization'):
    os.mkdir(r'D:\TextCategorization')
# NOTE(review): CreateAssist lives in SVM.FoldersCreation (not visible here);
# presumably it creates the sub-folders used further down -- confirm that it
# also tolerates folders that already exist.
FoldersCreation.CreateAssist()
print('创建文件夹模块运行结束')
print('***************************************************************************')
# ---------------------------------------------------------------------------
# Corpus handling: split the document collection into a training set and a
# test set
# ---------------------------------------------------------------------------
from SVM import CorpusPartition

# Both calls take the experiment parameters defined in the parameter section.
CorpusPartition.MoveCorpus(N)
CorpusPartition.moveAccordingPartition(N, count_done_research_times)

print('分割文本集模块运行结束')
print('*******************************************************************')
#########################文档集合分词##########################################################
from SVM import DataManager
from ctypes import *
import os
import cPickle as p
import re
# Word segmentation of the document collection.
# roots[i] is read and the segmented text is written to rootfinals[i], under
# the same relative path (index 0: training folders, index 1: testing folders).
roots = [r'D:\TextCategorization\training', r'D:\TextCategorization\testing']
rootfinals = [r'D:\TextCategorization\segmented', r'D:\TextCategorization\tsegmented']

# Load and initialise the ICTCLAS segmenter once for the whole run; the
# original reloaded the DLL and called ICTCLAS_Init again for every document.
dll = cdll.LoadLibrary("ICTCLAS30.dll")
dll.ICTCLAS_Init(c_char_p("."))
# Declare the return type so ctypes returns the C string directly. Without it
# ctypes assumes an int result, and rebuilding the pointer from that int can
# truncate it on platforms where pointers are wider than int.
dll.ICTCLAS_ParagraphProcess.restype = c_char_p

# Patterns are compiled once and reused for every document.
whitespace_run = re.compile(r"\s+")
# NOTE(review): '\xa1\xa1' is presumably the GBK full-width space -- confirm.
# It is matched on raw bytes, exactly as the original did.
double_a1 = re.compile('\xa1\xa1')

try:
    for i in range(0, 2):
        dm = DataManager.DataManager(roots[i])
        subdir = dm.GetSubDir()
        # Paths relative to roots[i]: <sub-folder> + os.sep + <file path>.
        filepathstotalsrc = []
        for sub in subdir:
            dm.SetFilePathsFromsubDir(roots[i] + os.sep + sub)
            filepathstotalsrc.extend(
                sub + os.sep + path for path in dm.GetFilePaths())
        for path in filepathstotalsrc:
            myfile = open(roots[i] + os.sep + path)
            try:
                s = myfile.read()
            finally:
                myfile.close()
            # The second argument (0) is passed exactly as in the original;
            # presumably it switches POS tagging off -- TODO confirm against
            # the ICTCLAS API.
            segmented = dll.ICTCLAS_ParagraphProcess(c_char_p(s), 0)
            # Collapse every whitespace run into a single '|' separator, then
            # drop the '\xa1\xa1' byte pairs.
            segmentedtmp = whitespace_run.sub('|', segmented)
            segmentedfinal = double_a1.sub('', segmentedtmp)
            fid = open(rootfinals[i] + os.sep + path, 'w')
            try:
                fid.write(segmentedfinal)
            finally:
                fid.close()
finally:
    # Release the segmenter even when a document fails part-way through.
    dll.ICTCLAS_Exit()
print('文档集分词模块运行结束')
print('**********************************************************************')
# ---------------------------------------------------------------------------
# Build the bag-of-words model
# ---------------------------------------------------------------------------
from SVM import BagOfWordsConstruction

# The model is built from the folder holding the segmented documents.
segmented_root = r'D:\TextCategorization\segmented'
BagOfWordsConstruction.BagOfWordsConstruction(segmented_root)

print('建立词袋子模型模块运行结束')
print('***********************************************************************************')
# ---------------------------------------------------------------------------
# Feature word selection
# ---------------------------------------------------------------------------
from SVM import FeatureSelectionModel
import cPickle as mypickle

# 2 * N // vfold is explicit floor division: the same value as the original
# 2*N/vfold for these int parameters, and it stays an int even where "/"
# means true division.
featurewords = FeatureSelectionModel.featureSelectionIG(
    featureDimension, toCalInfoGain, 2 * N // vfold)

# Persist the selected feature words. try/finally closes the handle even if
# pickling fails; the original leaked it in that case.
fid = open(r'D:\TextCategorization\VITData\keywords.dat', 'w')
try:
    mypickle.dump(featurewords, fid)
finally:
    fid.close()
print('特征词选择模块运行结束')
print('*******************************************************************************************')
# ---------------------------------------------------------------------------
# Build the document vector model (LibSVM-format output files)
# ---------------------------------------------------------------------------
from SVM import VSMformation

root1 = r'D:\TextCategorization\segmented'
root2 = r'D:\TextCategorization\tsegmented'
print('begin.....')
# Each entry: (output file, segmented corpus folder, message printed once the
# conversion has finished). Training corpus first, then the test corpus.
for _target, _corpus_root, _done_message in (
        (r'D:\TextCategorization\data\train.libsvm', root1, '训练语料库转化完毕'),
        (r'D:\TextCategorization\data\test.libsvm', root2, '测试语料库转化完毕')):
    VSMformation.LibSVMFormat(_target, _corpus_root)
    print(_done_message)
print('文档向量模型建立模块运行结束')
print('批处理完毕,congratulations!')