zoukankan      html  css  js  c++  java
  • 使用CNN做电影评论的负面检测——本质上感觉和n-gram或者LSTM相同,因为CNN做图像检测时卷积核一般是3x3,而做文本分类时卷积直接是一维的,窗口宽度为3、4、5

    代码如下:

    from __future__ import division, print_function, absolute_import
    
    import tensorflow as tf
    import tflearn
    from tflearn.layers.core import input_data, dropout, fully_connected
    from tflearn.layers.conv import conv_1d, global_max_pool
    from tflearn.layers.merge_ops import merge
    from tflearn.layers.estimator import regression
    from tflearn.data_utils import to_categorical, pad_sequences
    from tflearn.datasets import imdb
    import os
    from tensorflow.contrib.learn.python import learn
    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    import numpy as np
    
    # Number of word ids per document: used as the padding length, the input
    # layer width, and the VocabularyProcessor max_document_length.
    MAX_DOCUMENT_LENGTH = 200
    # NOTE(review): not referenced by the code shown here; the embedding layer
    # in do_cnn uses a literal output_dim=128 instead.
    EMBEDDING_SIZE = 50
    
    # Vocabulary size. Set by the __main__ section from the fitted vocabulary
    # and read by do_cnn to size the embedding layer.
    n_words=0
    
    
    def load_one_file(filename):
        """Return the entire contents of a text file as one string.

        Args:
            filename: Path of the file to read. It is opened in text mode with
                the platform's default encoding.

        Returns:
            The file's text; an empty string for an empty file.
        """
        # A single read() replaces the former line-by-line `+=` accumulation,
        # which re-allocated the growing string on every line.
        with open(filename) as f:
            return f.read()
    
    def load_files(rootdir, label):
        """Load every regular file directly inside ``rootdir`` under one label.

        Args:
            rootdir: Directory to scan. Entries that are not regular files
                (for example subdirectories) are skipped; there is no recursion.
            label: Value appended to the label list once per loaded file.

        Returns:
            A tuple ``(texts, labels)`` of two equal-length lists: the contents
            of each file, and ``label`` repeated once per file.
        """
        texts = []
        labels = []
        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # document order deterministic, so a seeded train/test split downstream
        # is reproducible.
        for entry in sorted(os.listdir(rootdir)):
            path = os.path.join(rootdir, entry)
            if os.path.isfile(path):
                texts.append(load_one_file(path))
                labels.append(label)
        return texts, labels
    
    
    def load_data(data_dir="../data/movie-review-data/review_polarity/txt_sentoken"):
        """Load the positive and negative review files into parallel lists.

        Args:
            data_dir: Directory containing the ``pos`` and ``neg``
                subdirectories. Defaults to the location the script has always
                used, so existing zero-argument calls behave as before.

        Returns:
            A tuple ``(texts, labels)``. Texts from ``pos`` come first with
            label 0, followed by texts from ``neg`` with label 1.
        """
        # The trailing "" keeps the trailing path separator of the original
        # hard-coded directory strings.
        pos_texts, pos_labels = load_files(os.path.join(data_dir, "pos", ""), 0)
        neg_texts, neg_labels = load_files(os.path.join(data_dir, "neg", ""), 1)
        return pos_texts + neg_texts, pos_labels + neg_labels
    def do_cnn(trainX, trainY, testX, testY):
        """Build and train a three-branch 1-D CNN text classifier with tflearn.

        Both input sets are padded to MAX_DOCUMENT_LENGTH and both label sets
        are one-hot encoded with two classes. The network embeds the word ids,
        runs three parallel conv_1d branches with window widths 3, 4 and 5,
        concatenates them, applies global max pooling and dropout, and ends in
        a 2-unit softmax layer trained with Adam on categorical cross-entropy.

        Args:
            trainX: Training documents as sequences of word ids.
            trainY: Training labels (class indices).
            testX: Held-out documents, used as the validation set during fit.
            testY: Held-out labels.

        Returns:
            None. The fitted model is not returned.
        """
        # The embedding layer below is sized from the module-level n_words.
        global n_words

        # Bring every document to the same length, padding with 0.
        train_inputs = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
        test_inputs = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)

        # One-hot encode the two class labels.
        train_targets = to_categorical(trainY, nb_classes=2)
        test_targets = to_categorical(testY, nb_classes=2)

        # Input and embedding layers shared by all convolution branches.
        net = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
        net = tflearn.embedding(net, input_dim=n_words+1, output_dim=128)

        # One convolution branch per window width, created in order 3, 4, 5.
        branches = [
            conv_1d(net, 128, width, padding='valid', activation='relu',
                    regularizer="L2")
            for width in (3, 4, 5)
        ]
        net = merge(branches, mode='concat', axis=1)
        net = tf.expand_dims(net, 2)
        net = global_max_pool(net)
        net = dropout(net, 0.5)
        net = fully_connected(net, 2, activation='softmax')
        net = regression(net,
                         optimizer='adam',
                         learning_rate=0.001,
                         loss='categorical_crossentropy',
                         name='target')

        # Train for 20 epochs, reporting metrics against the held-out split.
        model = tflearn.DNN(net, tensorboard_verbose=0)
        model.fit(train_inputs, train_targets,
                  n_epoch=20,
                  shuffle=True,
                  validation_set=(test_inputs, test_targets),
                  show_metric=True,
                  batch_size=32)
    
    if __name__ == '__main__':
        # Load the labelled review texts from disk via load_data().
        x, y = load_data()

        # Hold out 40% of the documents; the fixed seed keeps the split stable.
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.4, random_state=0)

        # Map each document to a fixed-length sequence of word ids.
        # NOTE(review): the vocabulary is fitted on the full corpus, including
        # the held-out documents. Fitting on x_train only would avoid leaking
        # test vocabulary into training; confirm before changing, since it
        # alters n_words and the reported results.
        vp = learn.preprocessing.VocabularyProcessor(
            max_document_length=MAX_DOCUMENT_LENGTH, min_frequency=1)
        vp.fit(x)
        x_train = np.array(list(vp.transform(x_train)))
        x_test = np.array(list(vp.transform(x_test)))

        # This code runs at module scope, so a plain assignment already rebinds
        # the module-level n_words that do_cnn reads. A `global` statement here
        # is redundant, and Python 3 rejects one that follows an earlier
        # module-level assignment to the same name.
        n_words = len(vp.vocabulary_)
        print('Total words: %d' % n_words)

        do_cnn(x_train, y_train, x_test, y_test)
                                                          

    准确率是100%

  • 相关阅读:
    MyBatis映射文件中用#和$传递参数的特点
    使用谷歌浏览器进行Web开发技巧
    YYYY-mm-dd HH:MM:SS 备忘录
    java通过UUID生成16位唯一订单号
    idea如何设置类头注释和方法注释
    如何用符号构建人的思维系统?
    临界点思维模型
    复利思维模型-拥抱人生的指数增长
    提升自我认知的有效方式
    如何去培养顶尖的思维模型?
  • 原文地址:https://www.cnblogs.com/bonelee/p/7908346.html
Copyright © 2011-2022 走看看