• An introduction to text sentiment analysis with Keras: sentiment tests with MLP, RNN, and LSTM models

# coding: utf-8

# In[1]:

import urllib.request
import os
import tarfile


# In[2]:

# Download the IMDb review archive if it is missing, then unpack it so
# that the reviews end up under example/data/aclImdb_v1/aclImdb/.
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath = "example/data/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', result)
if not os.path.exists("example/data/aclImdb_v1/aclImdb"):
    tfile = tarfile.open(filepath, 'r:gz')
    tfile.extractall('example/data/aclImdb_v1/')


# In[3]:

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer


# In[4]:

import re

def rm_tags(text):
    # Strip HTML markup such as <br /> from the raw reviews.
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)


# In[5]:

def read_files(filetype):
    # Collect all positive and negative reviews under train/ or test/.
    path = "example/data/aclImdb_v1/aclImdb/"
    file_list = []

    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]

    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]

    print('read', filetype, 'files:', len(file_list))
    # Each split holds 12,500 positive reviews followed by 12,500 negative ones.
    all_labels = [1] * 12500 + [0] * 12500

    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]

    return all_labels, all_texts


# In[6]:

y_train, train_text = read_files("train")


# In[7]:

y_test, test_text = read_files("test")


# In[8]:

train_text[0]        # first positive review


# In[9]:

y_train[0]           # its label: 1


# In[10]:

train_text[12500]    # first negative review


# In[11]:

y_train[12500]       # its label: 0


# In[12]:

# Build a dictionary from the 2,000 most frequent words in the training text.
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)


# In[13]:

print(token.document_count)
print(token.word_index)


# In[14]:

# Convert each review into a list of word indices.
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)


# In[15]:

print(x_train_seq[0])


# In[16]:

# Pad or truncate every sequence to exactly 100 indices.
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)


# In[17]:

x_train[0]


# In[18]:

# MLP: embedding -> flatten -> dense hidden layer -> sigmoid output.
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()


# In[19]:

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)


# In[20]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

def show_train_history(train_history, train, validation):
    # Plot one training metric and its validation counterpart per epoch.
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


# In[21]:

show_train_history(train_history, 'acc', 'val_acc')
show_train_history(train_history, 'loss', 'val_loss')


# In[22]:

scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]


# In[23]:

probability = model.predict(x_test)


# In[24]:

probability[:10]          # predictions for positive test reviews


# In[25]:

probability[12500:12510]  # predictions for negative test reviews


# In[26]:

predict = model.predict_classes(x_test)


# In[27]:

predict_classes = predict.reshape(-1)


# In[28]:

SentimentDict = {1: 'positive', 0: 'negative'}

def display_test_Sentiment(i):
    print(test_text[i])
    print('label:', SentimentDict[y_test[i]],
          'prediction:', SentimentDict[predict_classes[i]])


# In[29]:

display_test_Sentiment(2)


# In[30]:

display_test_Sentiment(12505)


# In[31]:

# RNN: the flatten step is replaced by a SimpleRNN layer running over
# the 100 embedded word vectors.
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN

model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.35))
model.add(SimpleRNN(units=16))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()


# In[32]:

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)


# In[33]:

scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]


# In[34]:

# LSTM: the same architecture with an LSTM layer in place of the SimpleRNN.
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()


# In[35]:

model.compile(loss='binary_crossentropy',
              # optimizer='rmsprop',
              optimizer='adam',
              metrics=['accuracy'])
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)


# In[36]:

show_train_history(train_history, 'acc', 'val_acc')
show_train_history(train_history, 'loss', 'val_loss')
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]

The text comes from the IMDb movie review dataset. Download it, place it under the expected path, and begin; the first cells above handle the download and extraction.

The rm_tags function filters out HTML tags, since the raw reviews contain markup such as <br />.
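For example, a quick sanity check (a hypothetical snippet, not part of the original notebook):

sample = "An <b>excellent</b> film.<br /><br />Highly recommended."
print(rm_tags(sample))  # -> An excellent film.Highly recommended.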

Next, read in all the texts together with their target labels, then build the dictionary: Tokenizer(num_words=2000) keeps the 2,000 most frequent words of the training text.
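A minimal way to peek at the fitted dictionary (a hypothetical check; the exact words depend on the corpus). Indices are assigned by descending frequency, so the most common words get the smallest numbers:

top5 = sorted(token.word_index.items(), key=lambda kv: kv[1])[:5]
print(top5)  # e.g. [('the', 1), ('and', 2), ('a', 3), ('of', 4), ('to', 5)]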

Each text is then converted into a sequence of word indices.
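For instance (an invented sentence; the actual indices depend on the fitted dictionary):

demo = token.texts_to_sequences(["the movie was good"])
print(demo)  # e.g. [[1, 17, 13, 49]]; words outside the top 2,000 are simply dropped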

The sequences are normalized to a fixed length of 100.
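pad_sequences left-pads short reviews with zeros and, by default, truncates long ones from the front, keeping the last 100 indices. A small illustration with made-up values:

short = [[1, 2, 3]]
long_seq = [list(range(1, 105))]
print(sequence.pad_sequences(short, maxlen=5))     # [[0 0 1 2 3]]
print(sequence.pad_sequences(long_seq, maxlen=5))  # [[100 101 102 103 104]]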

Build the MLP model. The embedding layer turns each length-100 index sequence into 100 vectors of 32 dimensions, mapping the words into a multi-dimensional geometric space so that each word carries some contextual relationship to its neighbours.
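The layer sizes can be verified by hand; the arithmetic below should match what model.summary() reports:

vocab, dims, seq_len, hidden = 2000, 32, 100, 256
print(vocab * dims)                      # 64000 weights in the Embedding layer
print(seq_len * dims * hidden + hidden)  # 819456 weights in the 256-unit Dense layer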

Compile, train, plot the training history, and evaluate the accuracy on the test set.
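Once trained, any new text must go through the same tokenizer and padding before prediction. A minimal sketch, assuming it runs right after the MLP training cell (the review string is invented):

review = "This film was terrible, a complete waste of time."
seq = token.texts_to_sequences([rm_tags(review)])
pad = sequence.pad_sequences(seq, maxlen=100)
print(model.predict(pad))  # a value near 0 means 'negative', near 1 'positive'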

Build the RNN model; for an introduction to RNNs see: https://www.cnblogs.com/bai2018/p/10466418.html
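With only 16 units the recurrent layer is tiny; its weight count follows the SimpleRNN formula units * (input_dim + units + 1), which can be checked against model.summary():

units, emb = 16, 32
print(units * (emb + units + 1))  # 784 weights in the SimpleRNN layer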


Then evaluate on the test set.
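model.evaluate returns the loss followed by each metric requested at compile time, so scores[1] above is the test accuracy:

loss, acc = model.evaluate(x_test, y_test, verbose=0)
print('test loss %.4f, test accuracy %.4f' % (loss, acc))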

Build the LSTM model; for an introduction see: https://www.cnblogs.com/bai2018/p/10466497.html
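An LSTM keeps four weight sets per unit (three gates plus the cell candidate), so its parameter count is 4 * units * (input_dim + units + 1), again checkable against model.summary():

units, emb = 32, 32
print(4 * units * (emb + units + 1))  # 8320 weights in the LSTM layer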

The final cell plots the LSTM training history and reports its accuracy.
