  • NLP_TF2_ZERO TO HERO

    A Tokenization demo

    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    # define a list of sentences and use Tokenizer to map words to tokens
    sentences = [
        'I love my dog',
        'I love my cat',
        # Tokenizer is smart: it ignores punctuation such as '!',
        # so the tokenization results below don't contain '!'
        'You love my dog!',
        # the sentences above each contain only four words,
        # so we add a longer, seven-word sentence
        'Do you think my dog is amazing?'
    ]
    # tokenization => encoding
    # tokenizer = Tokenizer(num_words = 100)
    # we can set a default token (oov_token) to represent words the tokenizer has never seen
    tokenizer = Tokenizer(num_words = 100,oov_token="<OOV>")
    # fit the tokenizer on the sentences
    tokenizer.fit_on_texts(sentences)
    # get the word-to-token mapping
    word_index = tokenizer.word_index
    
    # tokenizer.texts_to_sequences() creates a list of token sequences, one per sentence
    sequences = tokenizer.texts_to_sequences(sentences)
    # padding makes every sequence as long as the longest one by adding 0s to the front of its token list
    # padded = pad_sequences(sequences)
    # to add the 0s to the end of the sequences instead, set padding='post'
    padded = pad_sequences(sequences,padding='post')
    # to cap the padded length at something other than the max length, set maxlen
    # padded = pad_sequences(sequences,padding='post',maxlen=5)
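    # note (default keras_preprocessing behaviour): when maxlen is shorter than a sequence,
    # pad_sequences truncates tokens from the front ('pre') unless truncating='post' is set, e.g.
    # padded = pad_sequences(sequences,padding='post',maxlen=5,truncating='post')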
    print(word_index)
    print(sequences)
    print(padded)
    
    '''
    output:
    {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
    [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
    [[ 5  3  2  4  0  0  0]
     [ 5  3  2  7  0  0  0]
     [ 6  3  2  4  0  0  0]
     [ 8  6  9  2  4 10 11]]
    '''
    
    
    test_data = [
        'i really love my dog',
        'my dog loves my manatee'  
    ]
    # we feed in some sentences that contain words the tokenizer has never seen
    # and see what happens
    test_seq = tokenizer.texts_to_sequences(test_data)
    
    print(test_seq)
    # without an OOV token, the output for 'i really love my dog' was [4, 2, 1, 3]:
    # the five-word sentence produced only four tokens,
    # because word_index had no token to represent 'really'
    # after adding a default value such as <OOV> to the tokenizer,
    # the output is 'i really love my dog' -> [5, 1, 3, 2, 4]
    # this keeps each sequence the same length as its original sentence
    # more powerful tools, such as ragged tensors and padding, can also handle
    # the length mismatch between raw sentences and output sequences
    
    
    '''
    [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
    '''
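    # a minimal sketch of the ragged-tensor alternative mentioned above (assumes TF 2.x,
    # where tf.ragged.constant is available): each row keeps its own length, no padding needed
    ragged = tf.ragged.constant(test_seq)
    print(ragged)        # <tf.RaggedTensor [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]>
    print(ragged.shape)  # (2, None) -> the second dimension is ragged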
    

    Text sentiment detection

    import json
    import tensorflow as tf
    import numpy as np
    sentences = []
    labels = []
    urls = []
    
    with open("Sarcasm_Headlines_Dataset.json",'r') as f:
        while True:
            line = f.readline()
            if not line:  # readline() returns an empty string at EOF, so stop the loop
                break
            datastore = json.loads(line)
            sentences.append(datastore['headline'])
            labels.append(datastore['is_sarcastic'])
            urls.append(datastore['article_link'])
            
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    
    # tokenizer = Tokenizer(oov_token="<OOV>")
    # tokenizer.fit_on_texts(sentences)
    
    #word_index = tokenizer.word_index
    
    # sequences = tokenizer.texts_to_sequences(sentences)
    
    # padded = pad_sequences(sequences,padding='post')
    
    #print(word_index)
    
    #print(padded[0])
    
    #print(padded.shape)
    
    vocab_size = 10000
    oov_tok = "<OOV>"
    max_length = 100
    padding_type = 'post'
    trunc_type = 'post'
    embedding_dim = 16
    training_size = 20000
    
    training_sentences = sentences[0:training_size]
    testing_sentences = sentences[training_size:]
    
    training_labels = labels[0:training_size]
    testing_labels = labels[training_size:]
    
    # for training, re-create the tokenizer with a vocabulary limit and an OOV token
    tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_tok)
    # fit on the training data only
    tokenizer.fit_on_texts(training_sentences)
    
    word_index = tokenizer.word_index
    
    training_sequences = tokenizer.texts_to_sequences(training_sentences)
    training_padded = pad_sequences(training_sequences,maxlen=max_length,
                                    padding=padding_type,truncating=trunc_type)
    
    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
    testing_padded = pad_sequences(testing_sequences,maxlen=max_length,
                                    padding=padding_type,truncating=trunc_type)
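    # convert everything to NumPy arrays: newer TF versions are stricter about plain
    # Python lists being passed to model.fit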
    training_padded = np.array(training_padded)
    testing_padded = np.array(testing_padded)
    training_labels = np.array(training_labels)
    testing_labels = np.array(testing_labels)
    
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        # global average pooling averages the word vectors over the sequence dimension
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64,activation='relu'),
        tf.keras.layers.Dense(1,activation='sigmoid')
    ])
    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    num_epochs = 30
    
    history = model.fit(training_padded, training_labels, epochs=num_epochs,
                        validation_data=(testing_padded,testing_labels),verbose=2)
    
    
    '''
    Epoch 1/30
    625/625 - 1s - loss: 0.6411 - accuracy: 0.6237 - val_loss: 0.4966 - val_accuracy: 0.8125
    Epoch 2/30
    625/625 - 1s - loss: 0.3776 - accuracy: 0.8454 - val_loss: 0.3656 - val_accuracy: 0.8471
    Epoch 3/30
    625/625 - 1s - loss: 0.2841 - accuracy: 0.8874 - val_loss: 0.3443 - val_accuracy: 0.8566
    Epoch 4/30
    625/625 - 1s - loss: 0.2369 - accuracy: 0.9058 - val_loss: 0.3590 - val_accuracy: 0.8498
    Epoch 5/30
    625/625 - 1s - loss: 0.2017 - accuracy: 0.9234 - val_loss: 0.3517 - val_accuracy: 0.8548
    Epoch 6/30
    625/625 - 1s - loss: 0.1749 - accuracy: 0.9354 - val_loss: 0.3794 - val_accuracy: 0.8489
    Epoch 7/30
    625/625 - 1s - loss: 0.1555 - accuracy: 0.9427 - val_loss: 0.3870 - val_accuracy: 0.8533
    Epoch 8/30
    625/625 - 1s - loss: 0.1366 - accuracy: 0.9512 - val_loss: 0.4144 - val_accuracy: 0.8472
    Epoch 9/30
    625/625 - 1s - loss: 0.1237 - accuracy: 0.9553 - val_loss: 0.4326 - val_accuracy: 0.8475
    Epoch 10/30
    625/625 - 1s - loss: 0.1111 - accuracy: 0.9622 - val_loss: 0.4575 - val_accuracy: 0.8463
    Epoch 11/30
    625/625 - 1s - loss: 0.1004 - accuracy: 0.9647 - val_loss: 0.4880 - val_accuracy: 0.8430
    Epoch 12/30
    625/625 - 1s - loss: 0.0892 - accuracy: 0.9718 - val_loss: 0.5408 - val_accuracy: 0.8357
    Epoch 13/30
    625/625 - 1s - loss: 0.0815 - accuracy: 0.9732 - val_loss: 0.5543 - val_accuracy: 0.8378
    Epoch 14/30
    625/625 - 1s - loss: 0.0751 - accuracy: 0.9748 - val_loss: 0.5920 - val_accuracy: 0.8346
    Epoch 15/30
    625/625 - 1s - loss: 0.0666 - accuracy: 0.9785 - val_loss: 0.6260 - val_accuracy: 0.8310
    Epoch 16/30
    625/625 - 1s - loss: 0.0617 - accuracy: 0.9791 - val_loss: 0.6854 - val_accuracy: 0.8287
    Epoch 17/30
    625/625 - 1s - loss: 0.0567 - accuracy: 0.9814 - val_loss: 0.6992 - val_accuracy: 0.8268
    Epoch 18/30
    625/625 - 1s - loss: 0.0514 - accuracy: 0.9837 - val_loss: 0.7647 - val_accuracy: 0.8240
    Epoch 19/30
    625/625 - 1s - loss: 0.0482 - accuracy: 0.9848 - val_loss: 0.8646 - val_accuracy: 0.8180
    Epoch 20/30
    625/625 - 1s - loss: 0.0424 - accuracy: 0.9877 - val_loss: 0.8183 - val_accuracy: 0.8207
    Epoch 21/30
    625/625 - 1s - loss: 0.0393 - accuracy: 0.9888 - val_loss: 0.9400 - val_accuracy: 0.8186
    Epoch 22/30
    625/625 - 1s - loss: 0.0371 - accuracy: 0.9883 - val_loss: 0.9198 - val_accuracy: 0.8171
    Epoch 23/30
    625/625 - 1s - loss: 0.0346 - accuracy: 0.9894 - val_loss: 0.9481 - val_accuracy: 0.8165
    Epoch 24/30
    625/625 - 1s - loss: 0.0296 - accuracy: 0.9914 - val_loss: 0.9933 - val_accuracy: 0.8144
    Epoch 25/30
    625/625 - 1s - loss: 0.0278 - accuracy: 0.9920 - val_loss: 1.1238 - val_accuracy: 0.8129
    Epoch 26/30
    625/625 - 1s - loss: 0.0278 - accuracy: 0.9919 - val_loss: 1.0576 - val_accuracy: 0.8140
    Epoch 27/30
    625/625 - 1s - loss: 0.0241 - accuracy: 0.9930 - val_loss: 1.1133 - val_accuracy: 0.8126
    Epoch 28/30
    625/625 - 1s - loss: 0.0254 - accuracy: 0.9923 - val_loss: 1.2534 - val_accuracy: 0.8109
    Epoch 29/30
    625/625 - 1s - loss: 0.0211 - accuracy: 0.9937 - val_loss: 1.1678 - val_accuracy: 0.8095
    Epoch 30/30
    625/625 - 1s - loss: 0.0190 - accuracy: 0.9946 - val_loss: 1.2294 - val_accuracy: 0.8097
    '''
    # test
    sentence = [
        "granny starting to fear spiders in the garden might be real",
        "the weather today is bright and sunny"
    ]
    
    sequences = tokenizer.texts_to_sequences(sentence)
    
    padded = pad_sequences(sequences, maxlen=max_length,padding=padding_type,truncating=trunc_type)
    
    print(model.predict(padded))
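    # the sigmoid output is a probability of the headline being sarcastic; a simple
    # (assumed) 0.5 threshold turns it into a binary label
    probs = model.predict(padded)[:, 0]
    for text, p in zip(sentence, probs):
        label = "sarcastic" if p > 0.5 else "not sarcastic"
        print(f"{text} -> {label} ({p:.3f})")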
    
    
    # introduce LSTM layers into the model
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, 64),
        # stacked bidirectional LSTM layers
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64,activation='relu'),
        tf.keras.layers.Dense(1,activation='sigmoid')
    ])
    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    num_epochs = 30
    
    history = model.fit(training_padded, training_labels, epochs=num_epochs,
                        validation_data=(testing_padded,testing_labels),verbose=2)
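    # the training log above shows val_loss rising while training loss keeps falling
    # (overfitting); a minimal sketch using tf.keras.callbacks.EarlyStopping to stop
    # training once validation loss stops improving and keep the best weights:
    # early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
    #                                               restore_best_weights=True)
    # history = model.fit(training_padded, training_labels, epochs=num_epochs,
    #                     validation_data=(testing_padded,testing_labels),
    #                     callbacks=[early_stop], verbose=2)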
    

    poetry_creator

    !wget https://storage.googleapis.com/laurencemoroney-blog.appspot.com/eat_tensorflow2_in_30_days/irish-lyrics-eof.txt -O irish-lyrics-eof.txt --no-check-certificate
    
    import tensorflow as tf
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.optimizers import Adam
    import numpy as np
    
    tokenizer = Tokenizer()
    
    data = open('irish-lyrics-eof.txt').read()
    
    corpus = data.lower().split("\n")
    
    tokenizer.fit_on_texts(corpus)
    
    total_words = len(tokenizer.word_index) + 1
    
    print(tokenizer.word_index)
    
    print(total_words)
    
    
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            # generate n_gram_sequence
            '''
              think of this as splitting the sentence into incrementally longer prefixes:
                a
                ab
                abc
                abcd
            '''
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
            
            
    # pad sequences
    max_sequence_len = max([len(x) for x in input_sequences])
    '''
        after padding, the sequences look like this:
        0 0 0 0 0 0 0 0 4 2
        0 0 0 0 0 0 0 4 2 66
        0 0 0 0 0 0 4 2 66 8
        0 0 0 0 0 4 2 66 8 67
        everything except the last token is treated as the input (x),
        and the last token is treated as the label (y)
    '''
    input_sequences = np.array(pad_sequences(input_sequences, 
                                             maxlen=max_sequence_len, 
                                             padding='pre'))
    # create predictors and label
    xs, labels = input_sequences[:,:-1],input_sequences[:,-1]
    # one-hot encode the labels (y)
    ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
    
    print(xs[6])
    print(ys[6])
    print(xs[5])
    print(ys[5])
    
    print(tokenizer.word_index)
    
    
    model = Sequential()
    # total_words is the vocabulary size; input_length=max_sequence_len-1 because the last token of each sequence is used as the label
    model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
    model.add(Bidirectional(LSTM(150)))
    model.add(Dense(total_words, activation='softmax'))
    adam = Adam(learning_rate=0.01)
    # categorical cross-entropy as the classification loss
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    # earlystop = EarlyStopping(monitor = 'val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
    history = model.fit(xs, ys, epochs=100, verbose=1)
    # print a summary of the model
    model.summary()
    
    
    import matplotlib.pyplot as plt
    def plot_graphs(history, string):
      plt.plot(history.history[string])
      plt.xlabel("Epochs")
      plt.ylabel(string)
      plt.show()
     
    plot_graphs(history, 'accuracy')
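    # the same helper can plot the loss curve
    # plot_graphs(history, 'loss')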
    
    
    # test
    seed_text = "I've got a bad feeling about this"
    next_words = 100
      
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predicted = model.predict_classes(token_list, verbose=0)
        predict_x=model.predict(token_list, verbose=0) 
        predicted=np.argmax(predict_x,axis=1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    print(seed_text)
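    # note: scanning word_index for every generated word is O(vocab size); the fitted
    # Tokenizer also exposes a reverse index_word dict, so a leaner (assumed-equivalent)
    # lookup would be:
    # output_word = tokenizer.index_word.get(int(predicted[0]), "")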
    

    This post is from 博客园 (cnblogs). Author: 甫生. Please credit the original link when reposting: https://www.cnblogs.com/fusheng-rextimmy/p/15404865.html
