zoukankan      html  css  js  c++  java
  • 吴裕雄--天生自然神经网络与深度学习实战Python+Keras+TensorFlow:使用训练好的单词向量实现新闻摘要分类

    import pandas as pd

    # Load the news dataset; the file holds one JSON record per line.
    df = pd.read_json('/Users/chenyi/Documents/News_Category_Dataset.json', lines=True)
    df.head()

    # Fold the "THE WORLDPOST" label into "WORLDPOST" so both count as one category.
    df['category'] = df['category'].replace({"THE WORLDPOST": "WORLDPOST"})

    # Report how many categories exist and how many entries each one has.
    categories = df.groupby('category')
    print("total categories: ", categories.ngroups)
    print(categories.size())

    from keras.preprocessing import sequence
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence, one_hot

    # Each sample is the headline followed by the short description.
    df['text'] = df.headline + " " + df.short_description

    # Assign an integer id to every word and convert each text to an id sequence.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df.text)
    X = tokenizer.texts_to_sequences(df.text)
    df['words'] = X

    # Record the number of words in each entry.
    df['word_length'] = df.words.apply(len)
    # Drop entries with fewer than 5 words. The explicit copy makes the result
    # an independent frame, so adding columns to it later neither raises
    # SettingWithCopyWarning nor depends on view-vs-copy semantics.
    df = df[df.word_length >= 5].copy()
    df.word_length.describe()

    # Bring every id sequence to a common length of `maxlen`.
    maxlen = 50
    padded = sequence.pad_sequences(df.words, maxlen=maxlen)
    X = list(padded)

    # Number the categories: build the name -> id and id -> name lookups.
    category_sizes = df.groupby('category').size()
    categories = category_sizes.index.tolist()
    category_int = {name: idx for idx, name in enumerate(categories)}
    int_category = dict(enumerate(categories))

    # Store the numeric class id of every entry next to its category name.
    df['c2id'] = [category_int[name] for name in df['category']]
    import numpy as np
    import keras.utils as utils
    from sklearn.model_selection import train_test_split
    import numpy as np
    
    
    
    # Convert the padded sequences to an array and one-hot encode the class ids.
    X = np.array(X)
    Y = utils.to_categorical(df.c2id.tolist())

    # Split the data in two parts: 80% for training, 20% for validation.
    # A fixed seed keeps the split reproducible between runs.
    seed = 29
    x_train, x_val, y_train, y_val = train_test_split(
        X,
        Y,
        random_state=seed,
        test_size=0.2,
    )
    word_index = tokenizer.word_index

    EMBEDDING_DIM = 100

    # Read the pre-trained GloVe vectors into a word -> vector lookup.
    # Each line of the file is a word followed by its vector components.
    # The file is opened with an explicit UTF-8 encoding (the platform default
    # may differ) and inside a `with` block so the handle is closed even if
    # parsing a line raises.
    embeddings_index = {}
    with open('/Users/chenyi/Documents/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Total %s word vectors.' %len(embeddings_index))
    from keras.initializers import Constant
    
    # One row per tokenizer id (plus row 0); rows stay all-zero for words
    # that have no pre-trained vector.
    matrix_shape = (len(word_index) + 1, EMBEDDING_DIM)
    embedding_matrix = np.zeros(matrix_shape)
    for token, row in word_index.items():
        # Look up the pre-trained vector for this word, if there is one.
        vector = embeddings_index.get(token)
        if vector is None:
            continue
        embedding_matrix[row] = vector
            
         
    # These names are used below but are not imported anywhere else in this
    # script, so bring them into scope here.
    from keras import layers
    from keras.layers import Embedding, Flatten
    from keras.models import Sequential

    # Embedding layer initialised from the pre-trained matrix; trainable=False
    # keeps those vectors fixed while the rest of the network is trained.
    embedding_layer = Embedding(len(word_index)+1, EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=maxlen,
                                trainable=False)
    model = Sequential()
    model.add(embedding_layer)
    model.add(Flatten())
    model.add(layers.Dense(64, activation='relu'))
    # For a multi-class output use softmax: it yields one probability per
    # category (the layer has as many units as there are categories).
    model.add(layers.Dense(len(int_category), activation='softmax'))

    # categorical_crossentropy is the matching loss for one-hot encoded
    # multi-class targets.
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    # Train for 20 epochs, evaluating on the held-out split after each epoch.
    history = model.fit(
        x_train,
        y_train,
        batch_size=512,
        epochs=20,
        validation_data=(x_val, y_val),
    )

    # pyplot is used below but is not imported anywhere else in this script.
    import matplotlib.pyplot as plt

    # Older Keras releases record accuracy under 'acc'/'val_acc'; Keras >= 2.3
    # uses the metric name passed to compile(), i.e. 'accuracy'/'val_accuracy'.
    # Accept either so the lookup does not raise KeyError.
    hist = history.history
    acc_key = 'acc' if 'acc' in hist else 'accuracy'
    acc = hist[acc_key]
    val_acc = hist['val_' + acc_key]
    loss = hist['loss']
    val_loss = hist['val_loss']
    epochs = range(1, len(acc) + 1)

    # Accuracy curves.
    plt.title('Training and validation accuracy')
    plt.plot(epochs, acc, 'red', label='Training acc')
    plt.plot(epochs, val_acc, 'blue', label='Validation acc')
    plt.legend()

    # Loss curves, drawn on a separate figure.
    plt.figure()
    plt.title('Training and validation loss')
    plt.plot(epochs, loss, 'red', label='Training loss')
    plt.plot(epochs, val_loss, 'blue', label='Validation loss')
    plt.legend()

    plt.show()

     

  • 相关阅读:
    七 、linux正则表达式
    六、通配符
    Codeforces1099D.Sum in the tree(贪心)
    叮,出现!
    Codeforces1056E.Check Transcription(枚举+Hash)
    2018.11.25 AMC-ICPC 亚洲区域赛(焦作站)吊银
    Gym101889J. Jumping frog(合数分解+环形dp预处理)
    Gym101889E. Enigma(bfs+数位)
    Gym101889B. Buggy ICPC(打表)
    Codeforces1076F. Summer Practice Report(贪心+动态规划)
  • 原文地址:https://www.cnblogs.com/tszr/p/12237900.html
Copyright © 2011-2022 走看看