  • NLP(二十三)使用LSTM进行语言建模以预测最优词


    • N元模型

    • 准备工作
      数据集使用 Alice in Wonderland
    • 如何实现
    • 代码
    from __future__ import print_function
    from sklearn.model_selection import train_test_split
    import nltk
    import numpy as np
    import string
    # Load the raw corpus text from disk.
    with open('alice_in_wonderland.txt', 'r') as content_file:
        content = content_file.read()
    # Turn every punctuation character into a space, then collapse all runs of
    # whitespace into single spaces.
    punct_to_space = str.maketrans({ch: " " for ch in string.punctuation})
    content2 = " ".join(content.translate(punct_to_space).split())
    # Tokenize, lowercase, and keep only tokens of at least two characters.
    tokens = [word.lower() for word in nltk.word_tokenize(content2) if len(word) >= 2]
    # Size of the n-grams: the first N-1 words are the context, the last is the target.
    N = 3
    quads = list(nltk.ngrams(tokens, N))
    # nltk.ngrams docstring excerpt:
    #     Return the ngrams generated from a sequence of items, as an iterator.
    #     For example:
    #         >>> from nltk.util import ngrams
    #         >>> list(ngrams([1,2,3,4,5], 3))
    #         [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
    # Join each n-gram tuple back into one space-separated string.
    # The original loop built the string but never stored it, leaving
    # newl_app empty for every later step.
    newl_app = [' '.join(ln) for ln in quads]
    # print(newl_app[:3])
    # Vectorize the words
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer() # words => word-count vectors
    # CountVectorizer usage example:
    #     >>> corpus = [
    #     ...     'This is the first document.',
    #     ...     'This document is the second document.',
    #     ...     'And this is the third one.',
    #     ...     'Is this the first document?',
    #     ... ]
    #     >>> vectorizer = CountVectorizer()
    #     >>> X = vectorizer.fit_transform(corpus)
    #     >>> print(vectorizer.get_feature_names())
    #     ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
    #     >>> print(X.toarray())  # doctest: +NORMALIZE_WHITESPACE
    #     [[0 1 1 1 0 0 1 0 1]
    #      [0 2 0 1 0 1 1 0 1]
    #      [1 0 0 1 1 0 1 1 1]
    #      [0 1 1 1 0 0 1 0 1]]
    # Split every n-gram string into its context (first N-1 words) and its
    # target (the N-th word). The original loop computed both pieces but never
    # appended them, so the vectorizer was fitted on empty lists.
    x_trigm = []
    y_trigm = []
    for ngram_str in newl_app:
        words = ngram_str.split()
        x_trigm.append(" ".join(words[:N-1]))
        y_trigm.append(words[N-1])
    # Count-vectorize the contexts and the targets.
    x_trigm_check = vectorizer.fit_transform(x_trigm).todense()
    # The same vectorizer is re-fitted here, so vocabulary_ below reflects the
    # vocabulary learned from the targets (the one needed to decode Y columns).
    y_trigm_check = vectorizer.fit_transform(y_trigm).todense()
    # Dictionaries from word to integer and integer to word
    dictnry = vectorizer.vocabulary_
    rev_dictnry = {v:k for k,v in dictnry.items()}
    X = np.array(x_trigm_check)
    Y = np.array(y_trigm_check)
    # Split the encoded arrays and the raw context strings together so that
    # xtest_tg[i] stays aligned with Xtest[i] / Ytest[i].
    Xtrain, Xtest, Ytrain, Ytest,xtrain_tg,xtest_tg = train_test_split(X, Y,x_trigm, test_size=0.3,random_state=1)
    print("X Train shape",Xtrain.shape, "Y Train shape" , Ytrain.shape)
    print("X Test shape",Xtest.shape, "Y Test shape" , Ytest.shape)
    # Model Building
    from keras.layers import Input,Dense,Dropout
    from keras.models import Model
    BATCH_SIZE = 128
    NUM_EPOCHS = 20
    # Fully connected classifier: count-vector of the context words in,
    # softmax over the target vocabulary out.
    input_layer = Input(shape = (Xtrain.shape[1],),name="input")
    first_layer = Dense(1000,activation='relu',name = "first")(input_layer)
    first_dropout = Dropout(0.5,name="firstdout")(first_layer)
    second_layer = Dense(800,activation='relu',name="second")(first_dropout)
    third_layer = Dense(1000,activation='relu',name="third")(second_layer)
    third_dropout = Dropout(0.5,name="thirdout")(third_layer)
    fourth_layer = Dense(Ytrain.shape[1],activation='softmax',name = "fourth")(third_dropout)
    # NOTE: despite its name, `history` is the Keras Model itself, not the
    # History object returned by fit(); the name is kept because later code
    # refers to it.
    history = Model(input_layer,fourth_layer)
    history.compile(optimizer = "adam",loss="categorical_crossentropy",metrics=["accuracy"])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted a stray "None" line.
    history.summary()
    # Model Training
    history.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,epochs=NUM_EPOCHS, verbose=1,validation_split = 0.2)
    # Model Prediction
    Y_pred = history.predict(Xtest)
    # Testing: print a random sample of test contexts with the actual and the
    # predicted next word side by side.
    # The header's final argument was an unterminated string literal; it is
    # restored as a newline.
    print ("Prior bigram words","|Actual","|Predicted","\n")
    import random
    NUM_DISPLAY = 10
    # Cap the sample size so random.sample does not raise ValueError when the
    # test set holds fewer than NUM_DISPLAY rows.
    for i in random.sample(range(len(xtest_tg)),min(NUM_DISPLAY,len(xtest_tg))):
        # argmax picks the highest-scoring column; rev_dictnry maps it back to a word.
        print (i,xtest_tg[i],"|",rev_dictnry[np.argmax(Ytest[i])],"|",rev_dictnry[np.argmax(Y_pred[i])])


    X Train shape (17947, 2559) Y Train shape (17947, 2559)
    X Test shape (7692, 2559) Y Test shape (7692, 2559)
    Layer (type)                 Output Shape              Param #   
    input (InputLayer)           (None, 2559)              0         
    first (Dense)                (None, 1000)              2560000   
    firstdout (Dropout)          (None, 1000)              0         
    second (Dense)               (None, 800)               800800    
    third (Dense)                (None, 1000)              801000    
    thirdout (Dropout)           (None, 1000)              0         
    fourth (Dense)               (None, 2559)              2561559   
    Total params: 6,723,359
    Trainable params: 6,723,359
    Non-trainable params: 0
    Prior bigram words |Actual |Predicted 
    595 words don | fit | know
    3816 in tone | of | of
    5792 queen had | only | been
    2757 who seemed | to | to
    5393 her and | she | she
    4197 heard of | one | its
    2464 sneeze were | the | of
    1590 done with | said | whiting
    3039 and most | things | of
    4226 the queen | of | said


