zoukankan      html  css  js  c++  java
  • word2vector(含code)

    Word2Vec其实就是通过学习文本来用词向量的方式表征词的语义信息,即通过一个嵌入空间使得语义上相似的单词在该空间内距离很近。

    Embedding其实就是一个映射,将单词从原先所属的空间映射到新的多维空间中,也就是把原先词所在空间嵌入到一个新的空间中去。

    Word2Vec模型实际上分为了两个部分,第一部分为建立模型,第二部分是通过模型获取嵌入词向量。Word2Vec的整个建模过程实际上与自编码器(auto-encoder)的思想很相似,即先基于训练数据构建一个神经网络,当这个模型训练好以后,我们并不会用这个训练好的模型处理新的任务,我们真正需要的是这个模型通过训练数据所学得的参数,例如隐层的权重矩阵——后面我们将会看到这些权重在Word2Vec中实际上就是我们试图去学习的“word vectors”。基于训练数据建模的过程,我们给它一个名字叫“Fake Task”,意味着建模并不是我们最终的目的。

    上面提到的这种方法实际上会在无监督特征学习(unsupervised feature learning)中见到,最常见的就是自编码器(auto-encoder):通过在隐层将输入进行编码压缩,继而在输出层将数据解码恢复初始状态,训练完成后,我们会将输出层“砍掉”,仅保留隐层。

    https://www.leiphone.com/news/201706/QprrvzsrZCl4S2lw.html

    基于Python版本的实现:
    import math
    import sys
    import numpy as np

    class Ngram:
        """A candidate phrase: a tuple of tokens plus its occurrence count and score."""

        def __init__(self, tokens):
            """Store ``tokens`` (a tuple of strings) with a zero count and score."""
            self.tokens = tokens
            self.count = 0
            self.score = 0.0

        def set_score(self, score):
            """Record the phrase score computed by the caller."""
            self.score = score

        def get_string(self):
            """Return the tokens joined by underscores, e.g. ``'new_york'``."""
            return '_'.join(self.tokens)
    

    class Corpus:
        """Token stream read from a text file, with high-scoring bigrams merged into phrases.

        Constructing an instance reads the file, runs the requested number of
        phrase-merging passes and writes the resulting tokens to
        ``'preprocessed-' + filename``.
        """

        def __init__(self, filename, word_phrase_passes, word_phrase_delta, word_phrase_threshold, word_phrase_filename):
            """Read ``filename``, merge phrases and save the preprocessed tokens.

            Args:
                filename: Path of the whitespace-separated input text.
                word_phrase_passes: Number of bigram-merging passes to run.
                word_phrase_delta: Discount subtracted from each n-gram count
                    when scoring.
                word_phrase_threshold: Score an n-gram must exceed to be merged.
                word_phrase_filename: Prefix of the per-pass phrase list files.
            """
            i = 0
            all_tokens = []

            with open(filename, 'r') as file_pointer:
                for line in file_pointer:
                    for token in line.split():
                        token = token.lower()

                        # Keep only alphanumeric tokens longer than one character.
                        if len(token) > 1 and token.isalnum():
                            all_tokens.append(token)

                        # The progress counter includes tokens that were dropped.
                        i += 1
                        if i % 10000 == 0:
                            sys.stdout.write("\rReading corpus: %d" % i)
                            sys.stdout.flush()

            print("\nCorpus read: %d" % i)

            self.tokens = all_tokens

            for x in range(1, word_phrase_passes + 1):
                self.build_ngrams(x, word_phrase_delta, word_phrase_threshold, word_phrase_filename)

            self.save_to_file(filename)

        def build_ngrams(self, x, word_phrase_delta, word_phrase_threshold, word_phrase_filename):
            """Run one phrase pass: score adjacent token pairs and merge the good ones.

            Each n-gram is scored as ``(count - delta) / product(token counts)``.
            N-grams scoring above ``word_phrase_threshold`` are written to
            ``word_phrase_filename + '-<x>'`` and merged into single
            underscore-joined tokens in ``self.tokens``.

            Args:
                x: 1-based pass number, used for messages and the output suffix.
                word_phrase_delta: Discount subtracted from the n-gram count.
                word_phrase_threshold: Minimum score (exclusive) to keep an n-gram.
                word_phrase_filename: Prefix of the phrase list file.
            """
            ngrams = []
            ngram_map = {}

            token_count_map = {}
            for token in self.tokens:
                token_count_map[token] = token_count_map.get(token, 0) + 1

            # Slide a window of at most two tokens over the corpus. The very
            # first window holds a single token, so one 1-tuple is counted too.
            i = 0
            ngram_l = []
            for token in self.tokens:
                if len(ngram_l) == 2:
                    ngram_l.pop(0)

                ngram_l.append(token)
                ngram_t = tuple(ngram_l)

                if ngram_t not in ngram_map:
                    ngram_map[ngram_t] = len(ngrams)
                    ngrams.append(Ngram(ngram_t))

                ngrams[ngram_map[ngram_t]].count += 1

                i += 1
                if i % 10000 == 0:
                    sys.stdout.write("\rBuilding n-grams (%d pass): %d" % (x, i))
                    sys.stdout.flush()

            print("\nn-grams (%d pass) built: %d" % (x, i))

            filtered_ngrams_map = {}

            # Scoring formula referenced by the original author:
            # http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf
            i = 0
            with open(word_phrase_filename + ('-%d' % x), 'w') as file_pointer:
                for ngram in ngrams:
                    product = 1
                    for word_string in ngram.tokens:
                        product *= token_count_map[word_string]
                    ngram.set_score((float(ngram.count) - word_phrase_delta) / float(product))

                    if ngram.score > word_phrase_threshold:
                        filtered_ngrams_map[ngram.get_string()] = ngram
                        file_pointer.write('%s %d\n' % (ngram.get_string(), ngram.count))

                    i += 1
                    if i % 10000 == 0:
                        sys.stdout.write("\rScoring n-grams: %d" % i)
                        sys.stdout.flush()

            print("\nScored n-grams: %d, filtered n-grams: %d" % (i, len(filtered_ngrams_map)))

            # Combine the tokens: walk left to right and greedily replace each
            # accepted pair with its joined form.
            all_tokens = []
            i = 0
            total = len(self.tokens)
            while i < total:
                if i + 1 < total:
                    ngram_string = '_'.join((self.tokens[i], self.tokens[i + 1]))
                    if ngram_string in filtered_ngrams_map:
                        all_tokens.append(ngram_string)
                        i += 2
                        continue
                all_tokens.append(self.tokens[i])
                i += 1

            print("Tokens combined")

            self.tokens = all_tokens

        def save_to_file(self, filename):
            """Write the tokens to ``'preprocessed-' + filename``, 20 per line.

            The final line may hold fewer than 20 tokens. Earlier behaviour
            dropped that partial line, losing the last ``len(tokens) % 20``
            tokens of the corpus.
            """
            with open('preprocessed-' + filename, 'w') as filepointer:
                line_tokens = []
                for i, token in enumerate(self.tokens, start=1):
                    line_tokens.append(token)
                    if i % 20 == 0:
                        filepointer.write('%s\n' % ' '.join(line_tokens))
                        line_tokens = []

                    if i % 10000 == 0:
                        sys.stdout.write("\rWriting to preprocessed input file")
                        sys.stdout.flush()

                # Flush the trailing partial line.
                if line_tokens:
                    filepointer.write('%s\n' % ' '.join(line_tokens))

            print("\nPreprocessed input file written")

        def __getitem__(self, i):
            """Return the token at position ``i``."""
            return self.tokens[i]

        def __len__(self):
            """Return the number of tokens."""
            return len(self.tokens)

        def __iter__(self):
            """Iterate over the tokens in corpus order."""
            return iter(self.tokens)
    

    class Word:
        """A vocabulary entry: the token text and its occurrence count."""

        def __init__(self, word):
            """Store ``word`` with a zero count."""
            self.word = word
            self.count = 0


    class Vocabulary:
        """Words of a corpus sorted by descending count, with rare words pooled.

        Words seen fewer than ``min_count`` times are folded into a single
        ``'{rare}'`` entry whose count is the sum of theirs.
        """

        def __init__(self, corpus, min_count):
            """Build the vocabulary from an iterable of tokens.

            Args:
                corpus: Iterable of token strings.
                min_count: Words with a lower count are pooled into ``'{rare}'``.
            """
            self.words = []
            self.word_map = {}
            # Kept on the instance so filtering does not rely on a module global.
            self.min_count = min_count
            self.build_words(corpus, min_count)

            self.filter_for_rare_and_common()

        def build_words(self, corpus, min_count):
            """Count every token in ``corpus`` and record first-seen order.

            ``min_count`` is accepted for signature compatibility; filtering
            happens in ``filter_for_rare_and_common``.
            """
            words = []
            word_map = {}

            i = 0
            for token in corpus:
                if token not in word_map:
                    word_map[token] = len(words)
                    words.append(Word(token))
                words[word_map[token]].count += 1

                i += 1
                if i % 10000 == 0:
                    sys.stdout.write("\rBuilding vocabulary: %d" % len(words))
                    sys.stdout.flush()

            print("\nVocabulary built: %d" % len(words))

            self.words = words
            self.word_map = word_map  # Mapping from each token to its index in vocab

        def __getitem__(self, i):
            """Return the ``Word`` at index ``i``."""
            return self.words[i]

        def __len__(self):
            """Return the number of vocabulary entries."""
            return len(self.words)

        def __iter__(self):
            """Iterate over the ``Word`` entries in index order."""
            return iter(self.words)

        def __contains__(self, key):
            """Return whether the token string ``key`` has its own entry."""
            return key in self.word_map

        def indices(self, tokens):
            """Map tokens to vocabulary indices, using the ``'{rare}'`` index for unknowns."""
            return [self.word_map[token] if token in self else self.word_map['{rare}'] for token in tokens]

        def filter_for_rare_and_common(self, min_count=None):
            """Pool rare words into ``'{rare}'`` and sort entries by descending count.

            Args:
                min_count: Count threshold; defaults to the value given to the
                    constructor.
            """
            if min_count is None:
                min_count = self.min_count

            rare = Word('{rare}')
            kept = [rare]
            for token in self.words:
                if token.count < min_count:
                    rare.count += token.count
                else:
                    kept.append(token)

            kept.sort(key=lambda token: token.count, reverse=True)

            self.words = kept
            self.word_map = {token.word: i for i, token in enumerate(kept)}
    

    class TableForNegativeSamples:
        """Lookup table for drawing negative samples proportionally to ``count ** power``.

        Slot ``i`` of the table holds the index of the word whose cumulative
        probability interval contains ``i / table_size``, so a uniform draw of
        a slot yields a word with the smoothed unigram probability.
        """

        def __init__(self, vocab, table_size=int(1e6), power=0.75):
            """Build the table.

            Args:
                vocab: Iterable (re-iterable) of objects with a ``count`` attribute.
                table_size: Number of slots in the table.
                power: Exponent applied to each count before normalizing.
            """
            norm = sum(math.pow(t.count, power) for t in vocab)  # Normalizing constant

            # Running cumulative probability after each word.
            cumulative = []
            p = 0.0
            for word in vocab:
                p += math.pow(word.count, power) / norm
                cumulative.append(p)

            if cumulative:
                fractions = np.arange(table_size) / float(table_size)
                # For each slot, the first word whose cumulative probability
                # exceeds the slot's fraction.
                table = np.searchsorted(cumulative, fractions, side='right')
                # Rounding can leave the final cumulative value just under 1;
                # give those trailing slots to the last word instead of word 0.
                np.minimum(table, len(cumulative) - 1, out=table)
                table = table.astype(np.uint32)
            else:
                table = np.zeros(table_size, dtype=np.uint32)

            self.table = table

        def sample(self, count):
            """Return ``count`` word indices drawn uniformly from the table slots."""
            indices = np.random.randint(low=0, high=len(self.table), size=count)
            return list(self.table[indices])
    

    def sigmoid(z):
        """Return the logistic function of ``z``, saturated outside ``[-6, 6]``.

        Inputs above 6 return exactly 1.0 and inputs below -6 return exactly
        0.0, which also keeps ``math.exp`` away from very large arguments.
        """
        if z > 6:
            return 1.0
        if z < -6:
            return 0.0
        return 1 / (1 + math.exp(-z))

    def save(vocab, nn0, filename):
        """Write one ``word v1 v2 ...`` line per vocabulary entry to ``filename``.

        Args:
            vocab: Iterable of objects with a ``word`` attribute.
            nn0: Iterable of vectors, paired positionally with ``vocab``.
            filename: Path of the text file to create or overwrite.

        Spaces inside a word are replaced by underscores so every line splits
        cleanly on whitespace.
        """
        with open(filename, 'w') as file_pointer:
            for token, vector in zip(vocab, nn0):
                word = token.word.replace(' ', '_')
                vector_str = ' '.join(str(s) for s in vector)
                # Each record ends with a newline; otherwise all vectors run
                # together on a single line.
                file_pointer.write('%s %s\n' % (word, vector_str))

    if __name__ == '__main__':
        # Script entry point: preprocess each input file, then train skip-gram
        # embeddings with negative sampling and save the input-side vectors.

        # Alternative input used by the original author:
        # 'news-2012-phrases-10000.txt'
        for input_filename in ['in.txt']:

            # Number of negative examples
            k_negative_sampling = 5

            # Min count for words to be used in the model, else {rare}
            min_count = 3

            # Number of word phrase passes
            word_phrase_passes = 3

            # Min count for word phrase formula
            word_phrase_delta = 3

            # Threshold for word phrase creation
            word_phrase_threshold = 1e-4

            # Read the corpus
            corpus = Corpus(input_filename, word_phrase_passes, word_phrase_delta, word_phrase_threshold, 'phrases-%s' % input_filename)

            # Build the vocabulary and the negative-sampling table from the corpus
            vocab = Vocabulary(corpus, min_count)
            table = TableForNegativeSamples(vocab)

            # Max window length
            for window in [5]:

                # Dimensionality of word embeddings
                for dim in [100]:

                    print("Training: %s-%d-%d-%d" % (input_filename, window, dim, word_phrase_passes))

                    # Initialize network: small random input vectors, zero output vectors
                    nn0 = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(len(vocab), dim))
                    nn1 = np.zeros(shape=(len(vocab), dim))

                    # Initial learning rate
                    initial_alpha = 0.01

                    # Modified in loop
                    global_word_count = 0
                    alpha = initial_alpha
                    word_count = 0
                    last_word_count = 0

                    tokens = vocab.indices(corpus)

                    for token_idx, token in enumerate(tokens):
                        if word_count % 10000 == 0:
                            global_word_count += (word_count - last_word_count)
                            last_word_count = word_count

                            # NOTE: alpha stays constant. The author left a linear
                            # decay, initial_alpha * (1 - global_word_count / len(corpus))
                            # floored at initial_alpha * 0.0001, disabled here.

                            sys.stdout.write("\rTraining: %d of %d" % (global_word_count, len(corpus)))
                            sys.stdout.flush()

                        # Randomize window size, where window is the max window size
                        current_window = np.random.randint(low=1, high=window+1)
                        context_start = max(token_idx - current_window, 0)
                        context_end = min(token_idx + current_window + 1, len(tokens))
                        context = tokens[context_start:token_idx] + tokens[token_idx+1:context_end]

                        for context_word in context:
                            # Accumulated error for the context word's input vector
                            neu1e = np.zeros(dim)
                            # One positive example (the current token) plus k negatives
                            classifiers = [(token, 1)] + [(target, 0) for target in table.sample(k_negative_sampling)]
                            for target, label in classifiers:
                                z = np.dot(nn0[context_word], nn1[target])
                                p = sigmoid(z)
                                g = alpha * (label - p)
                                neu1e += g * nn1[target]              # Error to backpropagate to nn0
                                nn1[target] += g * nn0[context_word]  # Update nn1

                            # Update nn0
                            nn0[context_word] += neu1e

                        word_count += 1

                    global_word_count += (word_count - last_word_count)
                    print("\nTraining finished: %d" % global_word_count)

                    # Save model to file
                    save(vocab, nn0, 'output-%s-%d-%d-%d' % (input_filename, window, dim, word_phrase_passes))
    

    基于tensorflow版本的实现

    import time
    import numpy as np
    import tensorflow as tf
    import random
    from collections import Counter

    主要包括以下四个部分的代码:

    数据预处理:替换文本中特殊符号并去除低频词;对文本分词;构建语料;单词映射表

    训练样本构建

    模型构建

    模型验证

    首先加载数据

    # Load the whole text8 corpus into memory as one string.
    with open('text8') as f:
        text = f.read()

    定义函数来完成数据的预处理

    def preprocess(text, freq=5):
        """Tokenize ``text`` and drop low-frequency words.

        The text is lower-cased, punctuation marks are replaced by standalone
        tags such as ``<PERIOD>``, and the result is split on whitespace.

        Args:
            text: Raw text.
            freq: Frequency threshold; only words occurring more than ``freq``
                times are kept.

        Returns:
            The list of kept tokens, in their original order.
        """
        # Replace punctuation with tags. '--' is handled as a unit; newlines
        # are deliberately left alone (a '<NEW_LINE>' tag was disabled by the
        # original author).
        replacements = (
            ('.', ' <PERIOD> '),
            (',', ' <COMMA> '),
            ('"', ' <QUOTATION_MARK> '),
            (';', ' <SEMICOLON> '),
            ('!', ' <EXCLAMATION_MARK> '),
            ('?', ' <QUESTION_MARK> '),
            ('(', ' <LEFT_PAREN> '),
            (')', ' <RIGHT_PAREN> '),
            ('--', ' <HYPHENS> '),
            (':', ' <COLON> '),
        )
        text = text.lower()
        for symbol, tag in replacements:
            text = text.replace(symbol, tag)
        words = text.split()

        # Remove low-frequency words to reduce noise.
        word_counts = Counter(words)
        trimmed_words = [word for word in words if word_counts[word] > freq]

        return trimmed_words
    

    清洗文本并分词

    # Tokenize the loaded text with preprocess() and print the first 20 tokens.
    words = preprocess(text)
    print(words[:20])

    构建映射表

    # Build the two lookup tables between words and integer ids. Ids follow
    # the iteration order of the vocabulary set, enumerated once so both
    # tables are guaranteed to agree.
    vocab = set(words)
    int_to_vocab = dict(enumerate(vocab))
    vocab_to_int = {word: index for index, word in int_to_vocab.items()}

    enumerate()是用来遍历一个可迭代容器中的元素,同时通过一个计数器变量记录当前元素所对应的索引值。

    # Report the corpus size (all tokens) and the vocabulary size (distinct tokens).
    print("total words: {}".format(len(words)))
    print("unique words: {}".format(len(set(words))))

    整个文本中单词大约为1660万规模,词典大小为6万左右

    训练样本构建

    skip-gram中,训练样本的形式是(input word, output word),其中output word是input word的上下文。

    为了减少模型噪音并加速训练速度,我们在构造batch之前要对样本进行采样,剔除停用词等噪音因素。

    采样:对样本进行抽样,剔除高频的停用词来减少模型的噪音,并加速训练。

    对原文本进行vocab到int的转换

    # Convert the corpus from words to integer ids.
    int_words = [vocab_to_int[w] for w in words]

    t = 1e-5  # t value in the drop-probability formula
    threshold = 0.8  # Words whose drop probability reaches this value are removed

    # Count how often each word id occurs.
    int_word_counts = Counter(int_words)
    total_count = len(int_words)

    # Relative frequency of each word.
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}

    # Drop probability of each word: 1 - sqrt(t / frequency).
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}

    # Keep only the words whose drop probability is below the threshold. This
    # is a deterministic cut-off, not a random draw per occurrence.
    train_words = [w for w in int_words if prob_drop[w] < threshold]

    print(len(train_words))

    构建batch

    Skip-Gram模型是通过输入词来预测上下文。

    对于一个给定词,离它越近的词可能与它越相关,离它越远的词越不相关。这里我们设置窗口大小为5,对于每个训练单词,我们还会在闭区间[1, 5]内随机生成一个整数R,

    用R作为我们最终选择output word的窗口大小。这里之所以多加了一步随机数的窗口重新选择步骤,是为了能够让模型更聚焦于当前input word的邻近词。

    def get_targets(words, idx, window_size=5):
        """Return the distinct context words around position ``idx``.

        A window radius is drawn uniformly from ``1..window_size``; the words
        within that radius on both sides of ``idx`` (excluding ``idx`` itself)
        are collected and de-duplicated.

        Args:
            words: List of words (must support slicing and ``+``).
            idx: Index of the input word.
            window_size: Maximum window radius.

        Returns:
            A list of the unique context words, in no particular order.
        """
        target_window = np.random.randint(1, window_size + 1)
        # Clamp at 0 when there are fewer than target_window words before idx.
        start_point = max(idx - target_window, 0)
        end_point = idx + target_window
        # Slicing past the end of the list is safe, so no upper clamp is needed.
        targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
        return list(targets)
    

    def get_batches(words, batch_size, window_size=5):
        """Yield ``(x, y)`` training batches of input/context word pairs.

        Only full batches are used; trailing words that do not fill a batch are
        ignored. Context windows are taken within each batch, so they never
        cross a batch boundary.

        Args:
            words: List of words.
            batch_size: Number of input words per batch.
            window_size: Maximum context radius passed to ``get_targets``.

        Yields:
            Two equally long lists: ``x`` repeats each input word once per
            context word, and ``y`` holds the matching context words.
        """
        n_batches = len(words) // batch_size

        # Keep full batches only.
        words = words[:n_batches * batch_size]

        for start in range(0, len(words), batch_size):
            x, y = [], []
            batch = words[start:start + batch_size]
            for i, batch_x in enumerate(batch):
                batch_y = get_targets(batch, i, window_size)
                # One input word maps to several context words, so repeat it
                # to keep x and y the same length.
                x.extend([batch_x] * len(batch_y))
                y.extend(batch_y)
            yield x, y
    

    构建网络

    该部分包括:输入层,嵌入,负采样

    # Build the input side of the graph: placeholders and the embedding lookup.
    train_graph = tf.Graph()
    with train_graph.as_default():
        inputs = tf.placeholder(tf.int32, shape=[None], name='inputs')
        labels = tf.placeholder(tf.int32, shape=[None, None], name='labels')

    # The embedding matrix has shape vocab_size x embedding_size.
    vocab_size = len(int_to_vocab)
    embedding_size = 200  # Embedding dimension

    with train_graph.as_default():
        # Embedding weights, initialized uniformly in [-1, 1).
        embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
        # Select the embedding rows for the input word ids.
        embed = tf.nn.embedding_lookup(embedding, inputs)

    负采样:在完整的softmax下,每个训练样本都要更新输出层中全部词的权重,计算量很大;负采样每次只更新正样本和少量随机抽取的负样本所对应的权重,从而加快训练。

    # tf.nn.sampled_softmax_loss evaluates the loss on the true class plus a
    # random sample of classes; the TensorFlow documentation describes the
    # result as generally an underestimate of the full softmax loss.
    n_sampled = 100

    with train_graph.as_default():
        softmax_w = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(vocab_size))

        # Loss under sampling. Keyword arguments guard against the positional
        # order of labels/inputs differing between TensorFlow releases.
        loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                          biases=softmax_b,
                                          labels=labels,
                                          inputs=embed,
                                          num_sampled=n_sampled,
                                          num_classes=vocab_size)

        cost = tf.reduce_mean(loss)
        optimizer = tf.train.AdamOptimizer().minimize(cost)
    

    模型验证

    with train_graph.as_default():
        # Pick some word ids at random for validation.
        valid_size = 16
        valid_window = 100
        # Take half from ids 0..99 and half from ids 1000..1099.
        valid_examples = np.array(random.sample(range(valid_window), valid_size // 2))
        valid_examples = np.append(valid_examples,
                                   random.sample(range(1000, 1000 + valid_window), valid_size // 2))

        valid_size = len(valid_examples)
        # Validation word ids as a graph constant.
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # L2-normalize every embedding vector.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
        normalized_embedding = embedding / norm
        # Look up the normalized vectors of the validation words.
        valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
        # Cosine similarity between validation words and the whole vocabulary.
        similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))

    epochs = 10  # Number of training epochs
    batch_size = 1000  # Input words per batch
    window_size = 10  # Maximum context radius

    with train_graph.as_default():
        saver = tf.train.Saver()  # Used to checkpoint the trained variables

    with tf.Session(graph=train_graph) as sess:
        iteration = 1
        # Separate accumulator so the graph tensor named `loss` is not rebound.
        running_loss = 0
        sess.run(tf.global_variables_initializer())

        for e in range(1, epochs + 1):
            batches = get_batches(train_words, batch_size, window_size)
            start = time.time()

            for x, y in batches:
                # labels is declared as a 2-D placeholder, so add a column axis.
                feed = {inputs: x,
                        labels: np.array(y)[:, None]}
                train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)

                running_loss += train_loss

                if iteration % 100 == 0:
                    end = time.time()
                    print("Epoch {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(running_loss / 100),
                          "{:.4f} sec/batch".format((end - start) / 100))
                    running_loss = 0
                    start = time.time()

                # Periodically print the nearest neighbours of the validation words.
                if iteration % 1000 == 0:
                    sim = similarity.eval()
                    for i in range(valid_size):
                        valid_word = int_to_vocab[valid_examples[i]]
                        top_k = 8  # Number of nearest neighbours to show
                        # Position 0 of the sorted order is skipped; it is
                        # expected to be the word itself.
                        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                        log = 'Nearest to [%s]:' % valid_word
                        for k in range(top_k):
                            close_word = int_to_vocab[nearest[k]]
                            log = '%s %s,' % (log, close_word)
                        print(log)

                iteration += 1

        # Both calls need the live session, so they stay inside the `with` block.
        save_path = saver.save(sess, "checkpoints/text8.ckpt")
        embed_mat = sess.run(normalized_embedding)

    %matplotlib inline

    %config InlineBackend.figure_format = 'retina'

    import matplotlib.pyplot as plt
    from sklearn.manifold import TSNE

    # Project the first `viz_words` embedding rows to 2-D with t-SNE and plot
    # each point labelled with its word.
    viz_words = 500
    tsne = TSNE()
    embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])

    fig, ax = plt.subplots(figsize=(14, 14))
    for idx in range(viz_words):
        plt.scatter(*embed_tsne[idx, :], color='steelblue')
        plt.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)

  • 相关阅读:
    UICollectionView下拉使header放大模糊
    odoo13下searchpanel进行扩展.
    (原创)odoo关系字段在视图中的行为控制 总结
    (原创)odoo解决方案---接收以及回复外部邮件
    (原创)odoo11配置邮件功能的那些事儿
    入坑winpdb-1.4.8
    Python的hasattr() getattr() setattr() 函数使用方法详解
    jQuery webcam plugin
    (原创)odoo在docker环境下无法备份
    (转)PostgreSQL pg_dump&psql 数据的备份与恢复
  • 原文地址:https://www.cnblogs.com/Ann21/p/11313830.html
Copyright © 2011-2022 走看看