  • 吴裕雄--天生自然: Neural Networks and Deep Learning in Practice with Python + Keras + TensorFlow: classifying news summaries with pretrained word vectors

    import pandas as pd
    df = pd.read_json('/Users/chenyi/Documents/News_Category_Dataset.json', lines=True)
    df.head()
    df.category = df.category.map(lambda x:"WORLDPOST" if x == "THE WORLDPOST" else x)
    categories = df.groupby('category')
    print("total categories: ", categories.ngroups)
    print(categories.size())
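The dataset file is in JSON Lines format (one JSON object per line), which is what `lines=True` tells pandas. A minimal self-contained sketch of the same call on two made-up records:

    from io import StringIO
    toy = StringIO('{"headline": "a", "category": "TECH"}\n'
                   '{"headline": "b", "category": "SPORTS"}')
    # Each line is parsed as one row of the resulting DataFrame
    print(pd.read_json(toy, lines=True))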

    from keras.preprocessing import sequence
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence, one_hot
    df['text'] = df.headline + " " + df.short_description
    
    # Assign an integer id to each word (build the vocabulary)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df.text)
    X = tokenizer.texts_to_sequences(df.text)
    df['words'] = X
    
    # Record the word count of each entry
    df['word_length'] = df.words.apply(lambda i: len(i))
    # Drop entries with fewer than 5 words
    df = df[df.word_length >= 5]
    df.word_length.describe()
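As a quick illustration of what `Tokenizer` does, separate from the real data: `fit_on_texts` builds a word-to-integer-id vocabulary ordered by frequency, and `texts_to_sequences` then replaces each word with its id.

    toy_tok = Tokenizer()
    toy_tok.fit_on_texts(["the cat sat", "the dog sat down"])
    print(toy_tok.word_index)                       # {'the': 1, 'sat': 2, 'cat': 3, 'dog': 4, 'down': 5}
    print(toy_tok.texts_to_sequences(["the cat"]))  # [[1, 3]]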

    maxlen = 50
    X = list(sequence.pad_sequences(df.words, maxlen=maxlen))
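`pad_sequences` makes every sequence exactly `maxlen` ids long; with the Keras defaults, short sequences are padded with zeros at the front and long ones are truncated from the front. A tiny example:

    print(sequence.pad_sequences([[1, 2, 3], [4, 5]], maxlen=4))
    # [[0 1 2 3]
    #  [0 0 4 5]]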
    
    # Map each category name to an integer id (and back)
    categories = df.groupby('category').size().index.tolist()
    category_int = {}
    int_category = {}
    for i, k in enumerate(categories):
        category_int.update({k:i})
        int_category.update({i:k})
    
    df['c2id'] = df['category'].apply(lambda x: category_int[x])
    import numpy as np
    import keras.utils as utils
    from sklearn.model_selection import train_test_split
    
    X = np.array(X)
    Y = utils.to_categorical(list(df.c2id))
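`to_categorical` turns the integer category ids into one-hot vectors, the target format that `categorical_crossentropy` expects. A toy example with three made-up labels:

    print(utils.to_categorical([0, 2, 1]))
    # [[1. 0. 0.]
    #  [0. 0. 1.]
    #  [0. 1. 0.]]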
    
    
    # Split the data: 80% for training, 20% for validation
    
    seed = 29
    x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)
    word_index = tokenizer.word_index
    
    EMBEDDING_DIM = 100
    embeddings_index = {}
    # Load the pretrained 100-dimensional GloVe vectors into a dict: word -> vector
    f = open('/Users/chenyi/Documents/glove.6B/glove.6B.100d.txt')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    
    print('Total %s word vectors.' %len(embeddings_index))
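A quick sanity check on the loaded vectors, assuming `embeddings_index` from the loop above and that both example words are in the GloVe vocabulary: semantically related words should have a high cosine similarity.

    def cosine_similarity(a, b):
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    
    print(cosine_similarity(embeddings_index['king'], embeddings_index['queen']))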
    from keras.initializers import Constant
    from keras.layers import Embedding
    
    # Build the embedding matrix: row i holds the GloVe vector of the word with id i
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        # Look up the pretrained vector for this word (words missing from GloVe stay all-zero)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    
    # Frozen embedding layer initialized with the pretrained GloVe vectors
    embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=maxlen,
                                trainable=False)
    from keras.models import Sequential
    from keras.layers import Flatten
    from keras import layers
    
    model = Sequential()
    model.add(embedding_layer)
    model.add(Flatten())
    model.add(layers.Dense(64, activation='relu'))
    # For multi-class output, softmax turns the final layer into a probability for each category
    model.add(layers.Dense(len(int_category), activation='softmax'))
    
    # categorical_crossentropy is the standard loss for multi-class, one-hot encoded targets
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
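As a possible variant (not part of the original walkthrough), the `Flatten` layer can be swapped for `GlobalAveragePooling1D`, which averages the 50 word vectors instead of concatenating them and keeps the parameter count independent of `maxlen`. A sketch using a fresh copy of the frozen GloVe embedding:

    from keras.layers import GlobalAveragePooling1D
    
    alt_model = Sequential()
    alt_model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen, trainable=False))
    alt_model.add(GlobalAveragePooling1D())   # average over the 50 time steps
    alt_model.add(layers.Dense(64, activation='relu'))
    alt_model.add(layers.Dense(len(int_category), activation='softmax'))
    alt_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])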

    history = model.fit(x_train, y_train, epochs=20, validation_data=(x_val, y_val), batch_size=512)
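After training, the held-out 20% can be evaluated directly (this assumes `model`, `x_val` and `y_val` from above):

    val_metrics = model.evaluate(x_val, y_val, batch_size=512)
    print('validation loss: %.3f, validation accuracy: %.3f' % tuple(val_metrics))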

    import matplotlib.pyplot as plt
    
    # Note: on newer tf.keras versions these history keys are 'accuracy' / 'val_accuracy'
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)
    
    plt.title('Training and validation accuracy')
    plt.plot(epochs, acc, 'red', label='Training acc')
    plt.plot(epochs, val_acc, 'blue', label='Validation acc')
    plt.legend()
    
    plt.figure()
    plt.title('Training and validation loss')
    plt.plot(epochs, loss, 'red', label='Training loss')
    plt.plot(epochs, val_loss, 'blue', label='Validation loss')
    plt.legend()
    
    plt.show()
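To classify a new headline, it has to go through the same preprocessing as the training data: tokenize with the fitted `tokenizer`, pad to `maxlen`, then take the argmax of the softmax output. A sketch with a made-up headline (assumes `tokenizer`, `model` and `int_category` from above):

    sample = ["Stocks rally as investors shrug off rate fears"]   # hypothetical headline
    sample_seq = sequence.pad_sequences(tokenizer.texts_to_sequences(sample), maxlen=maxlen)
    pred = model.predict(sample_seq)
    print(int_category[np.argmax(pred[0])])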

     

  • Original article: https://www.cnblogs.com/tszr/p/12237900.html