  • User Review Sentiment Polarity Classification

    Migrated post: User Review Sentiment Polarity Classification
    This post presents the code for a practice contest on the Baidu Dianshi (点石) platform. The task is a classification problem: judging the sentiment polarity of user review text. The contest link is here.

    Data Preprocessing

    Build a corpus from the training and test data

    import jieba
    import codecs
    # Read a tab-separated file into a list of rows
    def load_data(file_path):
        data_set = []
        with open(file_path, 'r') as lines:
            for line in lines:
                line = line.strip()
                values = line.split("\t")
                data_set.append(values)
        return data_set


    dataAll = load_data('data_train.csv')
    dataTest = load_data('data_test.csv')
    csvfile = codecs.open("fenci_result.csv", 'w', 'utf-8')
    for item in dataAll:
        seg_list = jieba.cut(item[2])  # segment the review text with jieba
        csvfile.write(" ".join(seg_list))  # write the space-separated tokens to build the corpus
    for item in dataTest:
        seg_list = jieba.cut(item[-1])
        csvfile.write(" ".join(seg_list))
    csvfile.close()  # close the file so all tokens are flushed to disk
    

    Using this corpus, train a word2vec model that will later be used to turn sentences into vectors

    from gensim.models import word2vec
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.Text8Corpus("fenci_result.csv")  # load the corpus
    model = word2vec.Word2Vec(sentences, size=400)  # train a 400-dimensional word2vec model (gensim's default is CBOW, not skip-gram)
    
    # Save the model for reuse
    model.save("corpus.model")
    model.wv.save_word2vec_format("corpus.model.bin", binary=True)
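
    Before moving on, the saved vectors can be sanity-checked with a quick nearest-neighbour query. A minimal sketch, assuming the same old-style gensim API used above; the probe word "好吃" ("tasty") is a made-up example and only works if it actually appears in the corpus:

    from gensim.models import KeyedVectors
    # load the binary vectors saved above
    wv = KeyedVectors.load_word2vec_format("corpus.model.bin", binary=True)
    # print the five most similar words to a probe word (hypothetical example)
    print(wv.most_similar("好吃", topn=5))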
    

    Training and Testing

    The training approach here feels rather crude and leaves room for improvement (one possible refinement is sketched after the code below)

    #This script evaluates the model on a slice of the training data
    #coding=utf-8 
    import re
    import numpy as np
    import jieba
    from gensim.models import word2vec
    import logging
    import codecs
    from sklearn.decomposition import PCA
    from sklearn.model_selection import train_test_split
    from sklearn import svm
    from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, \
        roc_curve  # import the metric functions
    import prettytable  # table library, used to print the confusion matrix
    # Read a tab-separated file into a list of rows
    def load_data(file_path):
        data_set = []
        with open(file_path, 'r') as lines:
            for line in lines:
                line = line.strip()
                values = line.split("\t")
                data_set.append(values)
        return data_set
    # Write the predictions, one "id,label" pair per line
    def write_result(array, outpuFilePath):
        with open(outpuFilePath, 'w') as output_file:
            for i in range(len(array)):
                output_file.write("%d,%d\n" % (i + 1, array[i]))
    # Turn a segmented sentence into an array of word vectors,
    # skipping words that are missing from the vocabulary
    def getWordVecs(wordList):
        vecs = []
        for word in wordList:
            word = word.strip()
            try:
                vecs.append(model[word])
            except KeyError:
                continue
        return np.array(vecs, dtype='float')
    
    
    
    
    
    model = word2vec.KeyedVectors.load_word2vec_format("corpus.model.bin", binary = True)
    
    # segList=jieba.cut('烤鸭还是不错的,别的菜没什么特殊的')
    # resultList = getWordVecs(segList)
    # print(sum(np.array(resultList))/2)
    
    dataAll = load_data('data_train.csv')
    X = []
    y = []
    dataAll = np.array(dataAll[:1500])  # only the first 1500 rows for this quick test
    for item in dataAll:
        #temp=int(item[-1])
        #y.append(temp if temp!=0 else 1)  # map label 0 to 1 so classes 2 and 1 can be separated first
        segList = jieba.cut(item[2])
        vecList = getWordVecs(segList)
        if len(vecList) != 0:
            # average the word vectors into one sentence vector; append the
            # label in the same branch so that X and y stay aligned
            X.append(sum(np.array(vecList)) / len(vecList))
            y.append(int(item[-1]))
    x_train = np.array(X)
    y_train = np.array(y)
    print(x_train)
    print(y_train)
    
    # x_train = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2], [2, 1], [3, 2]])
    # print(x_train)
    # Reduce the dimensionality with sklearn's PCA
    model_pca = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
    newX = model_pca.fit_transform(x_train)  # fit the PCA model and project the training data
    components = model_pca.components_  # the principal axes in feature space
    components_var = model_pca.explained_variance_  # variance explained by each component
    components_var_ratio = model_pca.explained_variance_ratio_  # fraction of variance explained by each component
    print("\nPrincipal component analysis:")
    print(components)  # print the principal components
    print(len(components_var))  # print the number of retained components
    print(components_var_ratio)  # print the variance ratio of each component
    print(len(newX))
    print(len(newX[0]))
    
    
    X_train, X_test, y_train, y_test = train_test_split(newX, y_train, test_size=.3, random_state=0)
    clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr')
    clf.fit(X_train, y_train)
    
    y_hat = clf.predict(X_test)
    
    ## Evaluation metrics
    accuracy_s = accuracy_score(y_test, y_hat)  # accuracy
    precision_s = precision_score(y_test, y_hat, average='macro')  # precision
    recall_s = recall_score(y_test, y_hat, average='macro')  # recall
    f1_s = f1_score(y_test, y_hat, average='weighted')  # F1 score
    print('Accuracy:')
    print(accuracy_s)
    print('Precision:')
    print(precision_s)
    print('Recall:')
    print(recall_s)
    print('f-measure:')
    print(f1_s)
    
    ## Confusion matrix
    confusion_m = confusion_matrix(y_test, y_hat)  # compute the confusion matrix
    confusion_matrix_table = prettytable.PrettyTable()  # create a table instance
    confusion_matrix_table.add_row(confusion_m[0, :])  # first row
    confusion_matrix_table.add_row(confusion_m[1, :])  # second row
    confusion_matrix_table.add_row(confusion_m[2, :])  # third row
    print('confusion matrix')
    print(confusion_matrix_table)  # print the confusion matrix
    
    
    
    write_result(y_hat,'print.csv')
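
    As noted above, the plain average of word vectors is crude. One possible refinement is to weight each word vector by its inverse document frequency before averaging, so frequent filler words contribute less. The sketch below is an assumption, not part of the original pipeline: the helper name weighted_sentence_vec is made up, and it re-segments data_train.csv because fenci_result.csv was written without per-review newlines (get_feature_names is the pre-1.0 scikit-learn spelling; newer versions use get_feature_names_out):

    import jieba
    import numpy as np
    from gensim.models import KeyedVectors
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    wv = KeyedVectors.load_word2vec_format("corpus.model.bin", binary=True)
    # one space-separated document per review
    with open('data_train.csv') as f:
        docs = [" ".join(jieba.cut(line.strip().split("\t")[2])) for line in f]
    # token_pattern=r"\S+" keeps jieba's tokens intact; sklearn's default
    # pattern would drop single-character CJK tokens
    vectorizer = TfidfVectorizer(token_pattern=r"\S+")
    vectorizer.fit(docs)
    idf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
    
    # hypothetical helper: IDF-weighted mean of the word vectors in one review
    def weighted_sentence_vec(tokens, dim=400):
        vec, total = np.zeros(dim), 0.0
        for w in tokens:
            if w in wv and w in idf:
                vec += idf[w] * wv[w]
                total += idf[w]
        return vec / total if total > 0 else vec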
    

    Prediction

    Train the model on the full training set and predict labels for the test set

    #This script produces predictions for the test set
    #coding=utf-8 
    import re
    import numpy as np
    import jieba
    from gensim.models import word2vec
    import logging
    import codecs
    from sklearn.decomposition import PCA
    from sklearn.model_selection import train_test_split
    from sklearn import svm
    # Read a tab-separated file into a list of rows
    def load_data(file_path):
        data_set = []
        with open(file_path, 'r') as lines:
            for line in lines:
                line = line.strip()
                values = line.split('\t')
                data_set.append(values)
        return data_set
    # Write the predictions, one "id,label" pair per line
    def write_result(array, outpuFilePath):
        with open(outpuFilePath, 'w') as output_file:
            for i in range(len(array)):
                output_file.write("%d,%d\n" % (i + 1, array[i]))
    
    # Turn a segmented sentence into an array of word vectors,
    # skipping words that are missing from the vocabulary
    def getWordVecs(wordList):
        vecs = []
        for word in wordList:
            word = word.strip()
            try:
                vecs.append(model[word])
            except KeyError:
                continue
        return np.array(vecs, dtype='float')
    # Segment the test reviews, average them into sentence vectors, and
    # project them with the PCA model fitted on the training data
    def preDataHandle():
        preData = load_data('data_test.csv')
        xPre = []
        for item in preData:
            # the review text may itself contain tabs, so rejoin every
            # field after the first two back into a single string
            s = ''
            for j in range(len(item)):
                if j > 1:
                    s = "%s%s" % (s, item[j])
            segList = jieba.cut(s)
            vecList = getWordVecs(segList)
            if len(vecList) != 0:
                xPre.append(sum(np.array(vecList)) / len(vecList))
            else:
                # note: dropped rows shift the ids written by write_result
                print('vecList is empty for this row:')
                print(item)
        x_pre = np.array(xPre)
        # reuse the PCA fitted on the training data; refitting a new PCA on
        # the test data would project into a different space than the one
        # the classifier was trained in
        x_pre = model_pca.transform(x_pre)
        return x_pre
    
    
    
    model = word2vec.KeyedVectors.load_word2vec_format("corpus.model.bin", binary=True)
    dataAll = load_data('data_train.csv')
    X = []
    y = []
    #dataAll=np.array(dataAll[:1500])
    for item in dataAll:
        segList = jieba.cut(item[2])
        vecList = getWordVecs(segList)
        if len(vecList) != 0:
            # average the word vectors into one sentence vector; append the
            # label in the same branch so that X and y stay aligned
            X.append(sum(np.array(vecList)) / len(vecList))
            y.append(int(item[-1]))
        else:
            print(item)
    x_train = np.array(X)
    y_train = np.array(y)
    model_pca = PCA(n_components=0.95)  # keep enough components for 95% of the variance
    newX = model_pca.fit_transform(x_train)  # fit the PCA model and project the training data
    clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr')
    clf.fit(newX, y_train)
    
    x_pre = preDataHandle()
    y_pre = clf.predict(x_pre)
    
    write_result(y_pre, 'output.csv')
    print('Project has been finished successfully!')
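
    Retraining the PCA and SVM on every run is wasteful. A small sketch of persisting the fitted stages with the standalone joblib package (the file names are made up for illustration):

    import joblib
    # save the fitted PCA and SVM right after training
    joblib.dump(model_pca, 'pca.joblib')
    joblib.dump(clf, 'svm.joblib')
    # later: load them back and predict without retraining
    model_pca = joblib.load('pca.joblib')
    clf = joblib.load('svm.joblib')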
    

    The f1-score computed by the competition platform was 0.7249, which is quite low; there is plenty of room for improvement.
