Python project: output subclass probabilities (machine learning)

    from pandas import read_csv
    import numpy as np
    from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch was removed in newer scikit-learn releases
    import pickle    # pickle handles object persistence (Python 3 has no separate cPickle)
    from sklearn.feature_extraction.text import TfidfVectorizer
    import jieba
    import xlwt
    import operator  # used for sorting
    from sklearn import metrics
    
    
    Straindata=[]
    Strainlabel=[]
    Sart_train=[]
    
    Stestdata=[]
    Stestlabel=[]
    Sart_test=[]
    
    Slast=[]
    Snew=[]
    
         
    class obj:
        def __init__(self):
            self.key=0
            self.weight=0.0
    
    def importSmallContentdata(file,data,art,label,f):
        dataset=read_csv(file)
        Sdata = dataset.values[:,:]
        print(type(Sdata))
        
        if f==1:
            for line in Sdata:
                ls=[]
                ls.append(line[14])
                ls.append(line[15])
                ls.append(line[16])
                ls.append(line[17])
                Slast.append(ls)
            #print(len(Slast))
            #print("reference subclass data prepared")
                
        '''Collect the rows whose column smalli is nonzero into the data list, splitting the data apart'''
        for smalli in range(14,18):
            #print(smalli)
            count=0
            for line in Sdata:
                count=count+1
                if line[smalli]!='0' and line[smalli]!=0 :
                    k=1
                    ls=[]
                    for i in line:
                        if k==1:
                            art.append(i)
                            k=k+1
                            continue
                        if k==11:  # k is a 1-based counter, not a column index (line is 0-indexed); this keeps line[1]..line[9]
                            break
                        ls.append(float(i))
                        k=k+1
                    data.append(ls)
                    label.append(line[smalli])
                    if f==1:
                        Snew.append(count)
                        
        #print("why out of range", len(Snew))
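    # Assumed CSV layout (inferred from the indexing above, not stated in the
    # original): column 0 holds the raw text description appended to art,
    # columns 1-9 the numeric features collected into data, and columns 14-17
    # the four subclass label columns.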
    
    def getKvector(train_set,vec,n):
        nonzero=train_set.tdm.nonzero()
        k=0
        lis=[]
        gather=[]
        p=-1
        for i in nonzero[0]:
            p=p+1
            if k==i:
                a=obj()
                a.key=nonzero[1][p]
                a.weight=train_set.tdm[i,nonzero[1][p]]
                lis.append(a)
            else:
                lis.sort(key=lambda o: o.weight, reverse=True)  # sort this record's term objects by weight, descending
                gather.append(lis)
                while k < i:
                    k=k+1
                lis=[]
                a=obj()
                a.key=nonzero[1][p]
                a.weight=train_set.tdm[i,nonzero[1][p]]
                lis.append(a)
        gather.append(lis)  # gather holds, for each record, its term objects sorted by descending weight (each entry keeps both key and weight)
        
        #we only need the key; the weight is no longer needed
    
        sj=1
        for i in gather:
            ls=[]
            for j in i:
                sj=sj+1
                ls.append(float(j.key))
            while sj<=n:
                sj=sj+1
                ls.append(-1)
            sj=1
            vec.append(ls)
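    # A minimal alternative sketch of getKvector using numpy and the scipy
    # sparse matrix directly (illustrative, not part of the original pipeline).
    # It sorts each row's nonzero term indices by descending TF-IDF weight and
    # pads with -1 up to n; unlike the loop above, it also emits a padded entry
    # for rows that have no nonzero terms at all.
    def getKvector_np(train_set, vec, n):
        for row in train_set.tdm:              # each row is a 1 x m sparse matrix
            coo = row.tocoo()
            order = np.argsort(-coo.data)      # nonzero positions, descending by weight
            keys = [float(coo.col[idx]) for idx in order[:n]]
            keys += [-1] * (n - len(keys))     # pad to fixed length n
            vec.append(keys)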
    
    
                    
    '''Read the stop-word list'''
    def _readfile(path):  
        with open(path, "rb") as fp:  
            content = fp.read()  
        return content  
    
    '''Read a Bunch object'''
    def _readbunchobj(path):  
        with open(path, "rb") as file_obj:  
            bunch = pickle.load(file_obj)  
        return bunch  
    
    '''Write a Bunch object'''
    def _writebunchobj(path, bunchobj):  
        with open(path, "wb") as file_obj:  
            pickle.dump(bunchobj, file_obj) 
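    # Usage sketch for the helpers above (illustrative path, not in the original):
    #   b = Bunch(label=[1, 2], contents=["a,b", "c,d"])
    #   _writebunchobj("tmp.dat", b)
    #   assert _readbunchobj("tmp.dat").label == [1, 2]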
    
    def buildtrainbunch(bunch_path,art_train,trainlabel):
        bunch = Bunch(label=[],contents=[]) 
        for item1 in trainlabel:
            bunch.label.append(item1)
    
        #trainContentdatasave=[]  # would store the segmented tokens of the training data
        for item2 in art_train:
            item2=str(item2)
            item2 = item2.replace("\n", "")
            item2 = item2.replace(" ", "")
            content_seg=jieba.cut(item2)
            save2=''
            for item3 in content_seg:
                if len(item3) > 1 and item3 != '\n':
                    #trainContentdatasave.append(item3)
                    save2=save2+","+item3
            bunch.contents.append(save2)
        with open(bunch_path, "wb") as file_obj:  
            pickle.dump(bunch, file_obj)  
        print("Finished building the training text objects!")
    
    def buildtestbunch(bunch_path,art_test,testlabel):
        bunch = Bunch(label=[],contents=[]) 
        for item1 in testlabel:
            bunch.label.append(item1)
    
        #testContentdatasave=[]  # would store the segmented tokens of the test data
        for item2 in art_test:
            item2=str(item2)
            item2 = item2.replace("\n", "")
            item2 = item2.replace(" ", "")
            content_seg=jieba.cut(item2)
            save2=''
            for item3 in content_seg:
                if len(item3) > 1 and item3 != '\n':
                    #testContentdatasave.append(item3)
                    save2=save2+","+item3
            bunch.contents.append(save2)
        with open(bunch_path, "wb") as file_obj:  
            pickle.dump(bunch, file_obj)  
        print("Finished building the test text objects!")
    def vector_space(stopword_path,bunch_path,space_path):
        
        stpwrdlst = _readfile(stopword_path).splitlines()  # read the stop words
        bunch = _readbunchobj(bunch_path)  # load the segmented-text Bunch object
        #build the TF-IDF vector-space object
        tfidfspace = Bunch(label=bunch.label,tdm=[], vocabulary={})  
        
        #tdm is the weight matrix: tdm[i][j] is the TF-IDF weight of word j (its index in the vocabulary) in document i
        
        #initialize the vector-space model with TfidfVectorizer
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001,use_idf=True,max_features=15000)
        #print(vectorizer)
        #convert the text to a term-weight matrix; the vocabulary is kept separately
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)  
        tfidfspace.vocabulary = vectorizer.vocabulary_ 
        #persist the bag-of-words space
        _writebunchobj(space_path, tfidfspace)  
        print("TF-IDF vector space created successfully!")
    
    def testvector_space(stopword_path,bunch_path,space_path,train_tfidf_path):
        
        stpwrdlst = _readfile(stopword_path).splitlines()  # turn the stop words into a list
        bunch = _readbunchobj(bunch_path)  
        tfidfspace = Bunch(label=bunch.label,tdm=[], vocabulary={}) 
        #load the training set's TF-IDF vocabulary (the key step)
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary  
        
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7, vocabulary=trainbunch.vocabulary, min_df=0.001)  
        
        
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        _writebunchobj(space_path, tfidfspace)  
        print("TF-IDF vector space created successfully!")
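    # A possible alternative (sketch, not in the original): pickle the fitted
    # TfidfVectorizer after training and reuse it with .transform(), so the IDF
    # weights come from the training corpus instead of being re-fit on the test
    # corpus with a frozen vocabulary. vectorizer_path is a hypothetical path.
    def transform_with_saved_vectorizer(vectorizer_path, bunch_path, space_path):
        with open(vectorizer_path, "rb") as f:
            vectorizer = pickle.load(f)        # a fitted TfidfVectorizer
        bunch = _readbunchobj(bunch_path)
        tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary=vectorizer.vocabulary_)
        tfidfspace.tdm = vectorizer.transform(bunch.contents)
        _writebunchobj(space_path, tfidfspace)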
      
                  
    if __name__=="__main__":  
        
        '''============================ Load the data =================================='''
        file_train = 'F:/goverment/exceloperating/all_tocai_train.csv'
        file_test = 'F:/goverment/exceloperating/all_tocai_test.csv'
    
        importSmallContentdata(file_train,Straindata,Sart_train,Strainlabel,0)
        importSmallContentdata(file_test,Stestdata,Sart_test,Stestlabel,1)
        
        #print("Stestlabel", len(Stestlabel))
        
        #print("subclass data import finished")

        #print("major-class labels import finished")  # 1329*4 in total
        
        
        '''========================================================== TF-IDF text feature extraction: bunch and space paths ============================================================================'''
        #paths for the segmented Bunch objects
        train_bunch_path ="F:/goverment/exceloperating/trainbunch.bat"  # Bunch save path
        train_space_path = "F:/goverment/exceloperating/traintfdifspace.dat"
        test_bunch_path ="F:/goverment/exceloperating/testbunch.bat"
        test_space_path = "F:/goverment/exceloperating/testtfdifspace.dat"
        stopword_path ="F:/goverment/exceloperating/hlt_stop_words.txt"
    
        '''============================================================ TF-IDF text feature extraction on Sart =============================================================================='''
        buildtrainbunch(train_bunch_path,Sart_train,Strainlabel)
        buildtestbunch(test_bunch_path,Sart_test,Stestlabel)
        
        vector_space(stopword_path,train_bunch_path,train_space_path) 
        testvector_space(stopword_path,test_bunch_path,test_space_path,train_space_path)
        
        train_set=_readbunchobj(train_space_path)
        test_set=_readbunchobj(test_space_path)
    
        '''Training data'''
        
        S_vec_train=[]
        getKvector(train_set,S_vec_train,76)
      
        '''Test data'''
    
        S_vec_test=[]
        getKvector(test_set,S_vec_test,76)
    
    
        '''================= Merge the extracted 76-dimensional vectors with the earlier features into Straindata =================='''
    
        '''Subclass training data'''
        S_vec_train=np.array(S_vec_train)
        #print(type(S_vec_train))
        #print(S_vec_train.shape)
        Straindata=np.array(Straindata)
        #print(type(Straindata))
        #print(Straindata.shape)
        Straindata=np.hstack((S_vec_train,Straindata))
        #print(Straindata)
        
        '''Subclass test data'''
        S_vec_test=np.array(S_vec_test)
        Stestdata=np.array(Stestdata)
        Stestdata=np.hstack((S_vec_test,Stestdata))
        
    
    
        print("Subclass classification accuracy")
        Strainlabel=np.array(Strainlabel)
    
        from xgboost import XGBClassifier 
        clf= XGBClassifier(learning_rate =0.1,
         n_estimators=1150,
         max_depth=2,
         min_child_weight=1,
         gamma=0,
         subsample=0.8,
         colsample_bytree=0.8,
         objective= 'binary:logistic',
         nthread=4,  # no effect in this run
         scale_pos_weight=1,  # no effect in this run
         seed=27)
        clf.fit(Straindata, Strainlabel) 
        predict=clf.predict(Stestdata)
        aa=metrics.accuracy_score(Stestlabel, predict)
        print(aa)#40.09
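        # Added sketch (not in the original output): accuracy alone can be
        # misleading with imbalanced multi-class labels; the metrics module
        # already imported above also provides a per-class report.
        print(metrics.classification_report(Stestlabel, predict))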
    
    
        '''============================ Output the technical issues and their probabilities ================'''
        class attri:
            def __init__(self):
                self.key=0
                self.weight=0.0
      
    
        '''==================== Subclass ======================='''
        attribute_proba=clf.predict_proba(Stestdata)
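        # Illustrative numpy equivalent (not in the original) of the loop
        # below, which extracts the top-4 classes per row by repeatedly zeroing
        # out the maximum. Run here, before the loop mutates attribute_proba in
        # place; assumes at least four classes.
        top4_idx = np.argsort(-attribute_proba, axis=1)[:, :4]             # class positions
        top4_prob = np.take_along_axis(attribute_proba, top4_idx, axis=1)  # their probabilities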
        
        
        label=[]
        for i in attribute_proba:
            lis=[]
            k=0
            while k<4:
                k=k+1
                p=1
                mm=0
                sj=-1
                for j in i:
                    sj=sj+1
                    if j>mm:
                        mm=j
                        p=sj
                i[p]=0  # zero out the current max so the next pass finds the next-largest class
                a=attri()
                a.key=p
                a.weight=mm
                lis.append(a)
                #lis.append(p)
            label.append(lis)
        #next, combine label with Snew, then sort and dedupe so it can be compared with Slast
        #print("why out of range", len(Snew))
        print("label",len(label))
        count=0
        for lis in label:
            lis.append(Snew[count])
            count=count+1
        print("Combination done, ready to dedupe!")  # label and Snew both have length 1439 at this point
        
        bol=np.zeros(len(label)+1)
        Snew=[]
        for lis in label:
            if bol[lis[4]]==0:
                Snew.append(lis)
                bol[lis[4]]=1
        
        #print(len(Snew))  # 1162 after dedup
              
        for i in range(len(Slast)+1):
            if i==0:
                continue
            if bol[i]==0:
                ls=[]
                a=attri()
                a.weight=1
                a.key=0
                ls.append(a)
                ls.append(a)
                ls.append(a)
                ls.append(a)
                ls.append(i)
                Snew.append(ls)
        #print("Snew", len(Snew))  # 1329
        
        print("Dedup done, ready to sort!")
         
        Snew.sort(key=operator.itemgetter(4)) 
        print("Sorting done, ready to compare!")
        
    
        myexcel = xlwt.Workbook()
        sheet = myexcel.add_sheet('sheet')
        si=-2
        sj=-1
        #cys=1
        #print(Snew)
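        # Output layout (added note): each record occupies two spreadsheet
        # rows; the first holds the four predicted subclass keys, the second
        # the matching probabilities in percent.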
        for i in Snew:
            si=si+2
            #print(si)
            #print("For record %d:" % cys)
            #cys=cys+1
            for j in range(len(i)):
                if(j==len(i)-1):
                    continue
                sj=sj+1
                #sheet.write(si,sj,str(j))
                sheet.write(si,sj,str(i[j].key))
                sheet.write(si+1,sj,str(i[j].weight*100))
                #print("Probability of technical issue %d: %.2f %%" % (i[j].key, i[j].weight*100))
            sj=-1
        myexcel.save("Snew.xls") 