1 import urllib.request 2 import os 3 import tarfile 4 from keras.datasets import imdb 5 from keras.preprocessing import sequence 6 from keras.preprocessing.text import Tokenizer 7 import re 8 def rm_tags(text): 9 re_tag=re.compile(r'<[^>]+>') 10 return re_tag.sub('',text) 11 def read_files(filetype): 12 path="C:/Users/admin/.keras/aclImdb/" 13 file_list=[] 14 positive_path=path+filetype+"/pos/" 15 for f in os.listdir(positive_path): 16 file_list+=[positive_path+f] 17 negative_path=path+filetype+"/pos/" 18 for f in os.listdir(negative_path): 19 file_list+=[negative_path+f] 20 print('read',filetype,'files:',len(file_list)) 21 all_labels=([1]*12500+[0]*12500) 22 all_texts=[] 23 for fi in file_list: 24 with open(fi,encoding='utf8') as file_input: 25 all_texts+=[rm_tags(" ".join(file_input.readlines()))] 26 return all_labels,all_texts 27 y_train,train_text=read_files("train") 28 y_test,test_text=read_files("test") 29 print(train_text[0]) 30 print(y_train[0]) 31 token=Tokenizer(num_words=2000) 32 token.fit_on_texts(train_text) 33 print(token.document_count) 34 print(token.word_index) 35 x_train_seq=token.texts_to_sequences(train_text) 36 x_test_seq=token.texts_to_sequences(test_text) 37 print(train_text[0]) 38 print(x_train_seq[0]) 39 x_train=sequence.pad_sequences(x_train_seq,maxlen=100) 40 x_test=sequence.pad_sequences(x_test_seq,maxlen=100) 41 print('before pad_sequences lenfth=',len(x_train_seq[113])) 42 print(x_train_seq[113]) 43 print('after pad_sequences lenfth=',len(x_train[113])) 44 print(x_train[113]) 45 from keras.models import Sequential 46 from keras.layers import Dense,Dropout,Flatten,Activation 47 from keras.layers.embeddings import Embedding 48 model=Sequential() 49 model.add(Embedding(output_dim=32, 50 input_dim=2000, 51 input_length=100)) 52 model.add(Dropout(0.2)) 53 #model.add(SimpleRNN(units=16)) 54 model.add(Flatten()) 55 model.add(Dense(units=256, 56 activation='relu')) 57 model.add(Dropout(0.35)) 58 model.add(Dense(units=1, 59 activation='sigmoid')) 60 print(model.summary()) 61 model.compile(loss='binary_crossentropy', 62 optimizer='adam', 63 metrics=['accuracy']) 64 train_history=model.fit(x=x_train,y=y_train,batch_size=100, 65 epochs=10,verbose=2, 66 validation_split=0.2) 67 scores=model.evaluate(x_test,y_test,verbose=1) 68 print('accuracy',scores[1]) 69 predict=model.predict_classes(x_test) 70 print("prediction[:10]",predict[:10]) 71 predict_classes=predict.reshape(-1) 72 print(predict_classes[:10]) 73 SentimentDict = {1: '正面的', 0: '负面的'} 74 def display_test_Sentiment(i): 75 print(test_text[i]) 76 print('label真实值:', SentimentDict[y_test[i]], 77 '预测结果:', SentimentDict[predict_classes[i]]) 78 display_test_Sentiment(12502) 79 input_text=''' 80 I saw this film with my 6-year-old a couple weeks ago. While there's plenty about which to gripe, here's one of 81 my biggest problems: I can't stand this constant CGI-heavy everything-must-be-a-sequel-or- a- remake era of film 82 making. It's making movie makers lazy. 83 ''' 84 input_seq=token.texts_to_sequences([input_text]) 85 len(input_seq[0]) 86 print(input_seq[0]) 87 pad_input_seq=sequence.pad_sequences(input_seq,maxlen=100) 88 len(pad_input_seq[0]) 89 print(pad_input_seq[0]) 90 predict_result=model.predict_classes(pad_input_seq) 91 print(predict_result) 92 print(predict_result[0][0]) 93 print(SentimentDict[predict_result[0][0]]) 94 def predict_review(input_text): 95 input_seq=token.texts_to_sequences([input_text]) 96 pad_input_seq=sequence.pad_sequences(input_seq,maxlen=100) 97 predict_result=model.predict_classes(pad_input_seq) 98 print(SentimentDict[predict_result[0][0]]) 99 100 predict_review(''' 101 They poured on the whole "LeFou is gay" thing a bit thick for my taste. It was the only thing that added levity to the movie (despite how much fun it should have been already), but it seemed a bit cheap. I'm not going to apologize for wanting more for my LGBTQ characters than to be just the comic relief. 102 ''')
验证的准确率为0问题待解决