  • Text mining in Python (concise version)

    import xlrd
    import jieba
    import sys  
    import importlib
    import os         # Python's built-in module for file and directory operations (e.g. os.listdir)
    import pickle     # object serialisation / persistence
    import random
    import numpy as np
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    from pylab import mpl  
    from sklearn.naive_bayes import MultinomialNB        # multinomial Naive Bayes classifier
    from sklearn import svm
    from sklearn.svm import SVC                           # SVC is used directly in __main__ below
    from sklearn.linear_model import LogisticRegression   # needed by the commented-out logistic-regression variant
    
    from sklearn import metrics 
    from sklearn.utils import Bunch    # Bunch moved here; sklearn.datasets.base was removed in newer scikit-learn releases
    from sklearn.feature_extraction.text import TfidfVectorizer
    importlib.reload(sys)    # legacy Python 2 encoding workaround; effectively a no-op on Python 3
    
    
    # convert the text content and the class labels into vector form
    trainContentdatasave=[]   # holds the segmented words of all training and test data
    testContentdatasave=[]
    
    trainContentdata = []
    testContentdata = []
    trainlabeldata = []
    testlabeldata = []
    
    # load the training and test text descriptions from Excel
    def importTrainContentdata():
        file = '20180716_train.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            trainContentdata.append(ws.cell(r, 0).value)
    
    def importTestContentdata():
        file = '20180716_test.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            testContentdata.append(ws.cell(r, 0).value)   
    
    # load the training and test class labels from Excel
    def importTrainlabeldata():
        file = '20180716_train_label.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            trainlabeldata.append(ws.cell(r, 0).value)
            
    def importTestlabeldata():
        file = '20180716_test_label.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            testlabeldata.append(ws.cell(r, 0).value)
    
    
    if __name__=="__main__": 
        
        importTrainContentdata()
        importTestContentdata()
        importTrainlabeldata()
        importTestlabeldata()
        
        '''Naive Bayes
        clf = MultinomialNB(alpha=0.052).fit(train_set.tdm, train_set.label)  
        #clf = svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
        clf.fit(train_set.tdm, train_set.label)  
        predicted=clf.predict(test_set.tdm)
        
        Logistic regression
        tv = TfidfVectorizer()
        train_data = tv.fit_transform(X_train)
        test_data = tv.transform(X_test)
        
        lr = LogisticRegression(C=3)
        lr.fit(train_set.tdm, train_set.label)
        predicted=lr.predict(test_set.tdm)
        print(lr.score(test_set.tdm, test_set.label))
        #print(test_set.tdm)
        
        #SVM
        clf = SVC(C=1500)
        clf.fit(train_set.tdm, train_set.label)
        predicted=clf.predict(test_set.tdm)
        print(clf.score(test_set.tdm, test_set.label))
        '''
        
        tv = TfidfVectorizer()
        train_data = tv.fit_transform(trainContentdata)
        test_data = tv.transform(testContentdata)
    
        clf = SVC(C=1500)
        clf.fit(train_data, trainlabeldata)
        print(clf.score(test_data, testlabeldata))
        
        
        
        predicted = clf.predict(test_data)

        # collect true labels and predictions as plain integer class ids
        a = []
        b = []
        for i in range(len(predicted)):
            b.append(int(float(predicted[i])))
            a.append(int(float(testlabeldata[i])))
        
        '''
        f = open('F:/goverment/ArticleMining/predict.txt', 'w')
        for i in range(len(predicted)):
            f.write(str(b[i]))
            f.write('\n')
        f.write("done writing")
        f.close()
        #for i in range(len(predicted)):
            #print(b[i])
        '''
        #metrics_result(a, b)
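
    A note on preprocessing (not part of the original script): jieba is imported above and
    trainContentdatasave is declared to hold segmented text, but the script never actually
    segments the Chinese sentences, so TfidfVectorizer falls back to its default token pattern.
    The sketch below shows one possible segmentation step; it assumes the Excel cells contain
    raw Chinese sentences, and the helper name segment() is mine, not the author's.

    def segment(lines):
        """Cut each line with jieba and join the tokens with spaces,
        so that TfidfVectorizer can tokenise on whitespace."""
        return [" ".join(jieba.cut(str(line))) for line in lines]

    # possible usage inside __main__, replacing the raw-text calls:
    # tv = TfidfVectorizer()
    # train_data = tv.fit_transform(segment(trainContentdata))
    # test_data  = tv.transform(segment(testContentdata))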
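
    The script imports sklearn.metrics and ends with a commented-out call to metrics_result(a, b),
    but that helper is not defined in this excerpt. The sketch below is one plausible shape for it,
    built only from standard scikit-learn calls; the body is an assumption, not the author's
    original implementation.

    from sklearn import metrics

    def metrics_result(actual, predict):
        # macro-averaged precision, recall and F1 over all classes,
        # followed by the full per-class report
        print('precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='macro')))
        print('recall:    {0:.3f}'.format(metrics.recall_score(actual, predict, average='macro')))
        print('f1-score:  {0:.3f}'.format(metrics.f1_score(actual, predict, average='macro')))
        print(metrics.classification_report(actual, predict))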
  • Original post: https://www.cnblogs.com/caiyishuai/p/13270962.html