zoukankan      html  css  js  c++  java
  • NLP(二十四)使用LSTM构建生成式聊天机器人

    原文链接:http://www.one2know.cn/nlp24/

    • 准备
      数据集:AIML数据集
      下载数据集后用Notepad++打开,将内容复制到txt文件(代码中读取的是bot.txt)中,方便后续读取
    • 代码实现
      数据量很少,训练轮次也不多,因此结果不好,仅作示例
    import numpy as np
    import pandas as pd
    
    # Read the whole AIML dump; questions sit inside <pattern> tags and
    # answers inside <template> tags.
    with open('bot.txt','r') as content_file:
        botdata = content_file.read()

    # Splitting on a closing tag leaves each chunk ending in one tag body.
    # Keep the text after the first opening tag of the chunk, lower-cased.
    Questions = [
        chunk.partition('<pattern>')[2].lower()
        for chunk in botdata.split('</pattern>')
        if '<pattern>' in chunk
    ]
    Answers = [
        chunk.partition('<template>')[2].lower()
        for chunk in botdata.split('</template>')
        if '<template>' in chunk
    ]

    # Pair questions and answers positionally, then add a combined column
    # holding "question answer" for each row.
    qna_columns = np.column_stack([Questions,Answers])
    QnAdata = pd.DataFrame(qna_columns,columns=['Questions','Answers'])
    QnAdata['QnAcomb'] = QnAdata['Questions'] + ' ' + QnAdata['Answers']
    print(QnAdata.head(5))
    
    import nltk
    import collections
    
    ## Vectorization
    # Count every token of the combined question+answer text, then number the
    # words by descending frequency. Index 0 is reserved for padding, so real
    # words start at 1.
    counter = collections.Counter()
    for i in range(len(QnAdata)):
        # Select the text by column label; integer position lookup on a
        # label-indexed row is deprecated in pandas.
        counter.update(nltk.word_tokenize(QnAdata['QnAcomb'].iloc[i]))
    word2idx = {w:(i+1) for i,(w,_) in enumerate(counter.most_common())}
    idx2word = {v:k for k,v in word2idx.items()}
    idx2word[0] = 'PAD'
    vocab_size = len(word2idx) + 1
    # The newline belongs inside the literal as an escape sequence; a raw line
    # break inside a single-quoted string is a syntax error.
    print('\nVocabulary size:',vocab_size)
    
    def encode(sentence, maxlen,vocab_size):
        """Return a (maxlen, vocab_size) one-hot matrix for *sentence*.

        The sentence is tokenized with ``nltk.word_tokenize``; only the first
        ``maxlen`` tokens are used, and each one sets a single 1 in its row at
        the column given by ``word2idx``. Rows beyond the last token stay all
        zero. A token missing from ``word2idx`` raises ``KeyError``.
        """
        one_hot = np.zeros((maxlen, vocab_size))
        kept_tokens = nltk.word_tokenize(sentence)[:maxlen]
        for position, token in enumerate(kept_tokens):
            one_hot[position, word2idx[token]] = 1
        return one_hot
    
    def decode(indices, calc_argmax=True):
        """Turn a sequence of word ids (or per-step scores) into a sentence.

        When ``calc_argmax`` is true, *indices* is reduced along its last axis
        with ``np.argmax`` first; otherwise it is used as-is. Each id is then
        looked up in ``idx2word`` and the words are joined with spaces.
        """
        token_ids = np.argmax(indices, axis=-1) if calc_argmax else indices
        words = (idx2word[token_id] for token_id in token_ids)
        return ' '.join(words)
    
    # Number of token positions kept when encoding a question / an answer.
    question_maxlen, answer_maxlen = 10, 20
    
    def create_questions(question_maxlen,vocab_size):
        """Encode every entry of ``Questions`` into one 3-D one-hot array.

        Args:
            question_maxlen: number of token positions kept per question.
            vocab_size: width of the one-hot axis.

        Returns:
            Array of shape ``(len(Questions), question_maxlen, vocab_size)``
            whose row ``q`` is ``encode(Questions[q], ...)``.
        """
        question_idx = np.zeros(shape=(len(Questions),question_maxlen,vocab_size))
        # Each encoding goes to its own row. The row index must be the loop
        # variable: indexing with an unrelated global would send every
        # question to the same row and leave the others all zero.
        for q, question in enumerate(Questions):
            question_idx[q] = encode(question,question_maxlen,vocab_size)
        return question_idx

    quesns_train = create_questions(question_maxlen=question_maxlen,vocab_size=vocab_size)
    
    def create_answers(answer_maxlen,vocab_size):
        """Encode every entry of ``Answers`` into one 3-D one-hot array.

        Args:
            answer_maxlen: number of token positions kept per answer.
            vocab_size: width of the one-hot axis.

        Returns:
            Array of shape ``(len(Answers), answer_maxlen, vocab_size)``
            whose row ``q`` is ``encode(Answers[q], ...)``.
        """
        answer_idx = np.zeros(shape=(len(Answers),answer_maxlen,vocab_size))
        # Each encoding goes to its own row. The row index must be the loop
        # variable: indexing with an unrelated global would send every
        # answer to the same row and leave the others all zero.
        for q, answer in enumerate(Answers):
            answer_idx[q] = encode(answer,answer_maxlen,vocab_size)
        return answer_idx

    answs_train = create_answers(answer_maxlen=answer_maxlen,vocab_size=vocab_size)
    
    from keras.layers import Input,Dense,Dropout,Activation
    from keras.models import Model
    from keras.layers.recurrent import LSTM
    from keras.layers.wrappers import Bidirectional
    from keras.layers import RepeatVector,TimeDistributed,ActivityRegularization
    
    # Size of the LSTM encoder's output vector.
    n_hidden = 128

    # Input: one one-hot row per question token position.
    question_layer = Input(shape=(question_maxlen,vocab_size))

    # Encoder: the LSTM reads the whole question and, with return_sequences
    # left at its default, emits a single vector of size n_hidden.
    encoder_rnn = LSTM(n_hidden,dropout=0.2,recurrent_dropout=0.2)(question_layer)
    # Alternative encoder (disabled): a bidirectional wrapper whose forward and
    # backward outputs are merged; merge_mode is one of
    # {'sum', 'mul', 'concat', 'ave', None}.
    # encoder_rnn = Bidirectional(LSTM(n_hidden,dropout=0.2,recurrent_dropout=0.2),merge_mode='concat')(question_layer)

    # Repeat the encoder vector once per answer position:
    # (batch, features) -> (batch, answer_maxlen, features).
    repeat_encode = RepeatVector(answer_maxlen)(encoder_rnn)

    # Apply the same Dense projection to vocab_size scores at every answer
    # position.
    dense_layer = TimeDistributed(Dense(vocab_size))(repeat_encode)

    # Adds an L2 penalty on these activations to the loss; the values
    # themselves pass through unchanged.
    regularized_layer = ActivityRegularization(l2=1)(dense_layer)

    # Normalize the scores at each position into a distribution over words.
    softmax_layer = Activation('softmax')(regularized_layer)

    model = Model([question_layer],[softmax_layer])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()
    
    # Model training
    # Cast both one-hot arrays to float32 before handing them to Keras.
    quesns_train_2, answs_train_2 = (
        one_hot.astype('float32') for one_hot in (quesns_train, answs_train)
    )

    model.fit(
        quesns_train_2,
        answs_train_2,
        batch_size=32,
        epochs=30,
        validation_split=0.05,
    )
    
    # Model prediction
    # Predict answers for the first three training questions and print the
    # decoded text of the first two.
    ans_pred = model.predict(quesns_train_2[0:3])
    for sample in (0, 1):
        print(decode(ans_pred[sample]))
    
  • 相关阅读:
    文件处理seek以及修改内容的两种方式
    三元表达式、列表推导式、生成器表达式、递归、匿名函数、内置函数
    MySQL逻辑查询语句执行顺序
    函数基础
    迭代器、生成器、面向过程编程
    3 函数
    文件处理
    字符编码
    2 数据类型、字符编码、文件处理
    转载文章之提供给开发者 10 款最好的 Python IDE
  • 原文地址:https://www.cnblogs.com/peng8098/p/nlp_24.html
Copyright © 2011-2022 走看看