zoukankan      html  css  js  c++  java
  • tf-idf算法

    import numpy as np
    from collections import Counter
    import itertools
    import matplotlib.pyplot as plt
    # Toy corpus: every string is one document.
    docs = [
        "it is a good day, I like to stay here",
        "I am happy to be here",
        "I am bob",
        "it is sunny today",
        "I have a party today",
        "it is a dog and that is a cat",
        "there are dog and cat on the tree",
        "I study hard this morning",
        "today is a good day",
        "tomorrow will be a good day",
        "I like coffee, I like book and I like apple",
        "I do not like it",
        "I am kitty, I like bob",
        "I do not care who like bob, but I like kitty",
        "It is coffee time, bring your cup",
    ]
    # Tokenize each document: strip commas, then split on single spaces.
    docs_words = [text.replace(",", "").split(" ") for text in docs]
    # Vocabulary: the distinct tokens seen across all documents.
    # chain.from_iterable flattens the per-document token lists; set() removes
    # duplicates (and has no guaranteed order).
    vocab = set(itertools.chain.from_iterable(docs_words))
    # word -> row index, numbered in the set's iteration order.
    v2i = dict(zip(vocab, itertools.count()))
    # row index -> word (the inverse mapping of v2i).
    i2v = dict(zip(v2i.values(), v2i.keys()))
    
    def safe_log(x):
        """Return the element-wise natural log of ``x``, keeping zeros as zeros.

        Entries equal to 0 are left at 0 instead of becoming ``-inf``; every
        other entry is replaced by ``np.log`` of itself.

        The input is never modified: the work is done on a float64 copy, so
        callers can keep using ``x`` afterwards, and integer inputs do not
        have their logarithms truncated back to integers.

        Args:
            x: array-like of numbers.

        Returns:
            A new float64 ``numpy.ndarray`` with the same shape as ``x``.
        """
        # np.array copies by default, which decouples the result from the input.
        result = np.array(x, dtype=np.float64)
        nonzero = result != 0
        result[nonzero] = np.log(result[nonzero])
        return result
    
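    # A lambda is an anonymous function: it has no name of its own.
    # Its inputs are the values bound to its parameter list, and its output is
    # the value of its single expression.
    # A lambda body is limited to one expression, so it only suits simple logic.
    # Like any other function it can still read names from enclosing and global
    # scopes (the idf lambdas below read the module-level `docs`).
    # lambda x, y: x*y			# takes x and y, returns their product x*y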
    
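    # axis=0 vs. axis=1 in NumPy reductions:
    # axis=0 reduces down the rows, producing one result per column;
    # axis=1 reduces across the columns, producing one result per row.
    # keepdims=True keeps the reduced axis with length 1, so the result stays 2-D.
    # e.g. print(np.sum(a, axis=1, keepdims=True))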
    def _tf_log(x):
        # Logarithmic damping of the term frequency.
        return np.log(1+x)


    def _tf_augmented(x):
        # Rescale each row by its own maximum, then map into [0.5, 1].
        row_max = np.max(x, axis=1, keepdims=True)
        return 0.5 + 0.5 * x / row_max


    def _tf_boolean(x):
        # Cap every entry at 1.
        return np.minimum(x, 1)


    def _tf_log_avg(x):
        # The numerator is evaluated before the row mean is taken, keeping the
        # left-to-right order of the single-expression form in case safe_log
        # works in place on its argument.
        numerator = 1 + safe_log(x)
        denominator = 1 + safe_log(np.mean(x, axis=1, keepdims=True))
        return numerator / denominator


    # Term-frequency weighting schemes, keyed by the name passed as `method`.
    tf_methods = {
        "log": _tf_log,
        "augmented": _tf_augmented,
        "boolean": _tf_boolean,
        "log_avg": _tf_log_avg,
    }
    
    
    def _idf_log(x):
        # 1 + log(N / (df + 1)); N is the current number of documents.
        return 1 + np.log(len(docs) / (x+1))


    def _idf_prob(x):
        # log((N - df) / (df + 1)), clipped below at 0.
        n_docs = len(docs)
        return np.maximum(0, np.log((n_docs - x) / (x+1)))


    def _idf_len_norm(x):
        # df divided by (sum of squared df over all entries, plus 1).
        squared_total = np.sum(np.square(x))
        return x / (squared_total+1)


    # Inverse-document-frequency schemes, keyed by the name passed as `method`.
    # Each callable receives the document-frequency array.
    idf_methods = {
        "log": _idf_log,
        "prob": _idf_prob,
        "len_norm": _idf_len_norm,
    }
    # Counter usage example:
    # word_counts = Counter(words)
    # top_three = word_counts.most_common(3)   # the 3 most frequent words
    # print(top_three)
    # [('eyes', 8), ('the', 5), ('look', 4)]
    def get_tf(method="log"):
        """Build the weighted term-frequency matrix for the corpus.

        For each document, a word's raw frequency is its count divided by the
        count of the most frequent word in that same document. The matrix of
        raw frequencies is then passed through the weighting function selected
        by ``method``.

        Args:
            method: key into ``tf_methods``.

        Returns:
            The value returned by the selected weighting function for the
            ``[len(vocab), len(docs)]`` float64 raw-frequency matrix.

        Raises:
            ValueError: if ``method`` is not a key of ``tf_methods``.
        """
        weighted_tf = tf_methods.get(method)
        if weighted_tf is None:
            raise ValueError(
                "unknown tf method {!r}; expected one of {}".format(
                    method, sorted(tf_methods)))

        # term frequency: how frequent a word appears in a doc
        _tf = np.zeros((len(vocab), len(docs)), dtype=np.float64)    # [n_vocab, n_doc]
        for i, d in enumerate(docs_words):
            counter = Counter(d)
            # The per-document maximum is the same for every word, so compute
            # it once; the default only matters for a document with no tokens,
            # where the inner loop does not run anyway.
            max_count = max(counter.values(), default=1)
            for v, count in counter.items():
                _tf[v2i[v], i] = count / max_count

        return weighted_tf(_tf)
    
    
    def get_idf(method="log"):
        """Compute the inverse document frequency of every indexed word.

        The document frequency of a word is the number of documents in
        ``docs_words`` that contain it at least once. The resulting column
        vector is passed to the function selected by ``method``.
        A word that appears in fewer documents gets a larger idf under the
        "log" scheme, i.e. it discriminates better between documents.

        Args:
            method: key into ``idf_methods``.

        Returns:
            The value returned by the selected idf function for the
            ``[len(i2v), 1]`` document-frequency array.

        Raises:
            ValueError: if ``method`` is not a key of ``idf_methods``.
        """
        idf_fn = idf_methods.get(method)
        if idf_fn is None:
            raise ValueError(
                "unknown idf method {!r}; expected one of {}".format(
                    method, sorted(idf_methods)))

        # inverse document frequency: low idf for a word appears in more docs, mean less important
        df = np.zeros((len(i2v), 1))
        for d in docs_words:
            # set(d) counts each word at most once per document, and replaces
            # a scan of every document for every vocabulary word.
            for v in set(d):
                df[v2i[v], 0] += 1

        return idf_fn(df)
    
    
    def cosine_similarity(q, _tf_idf):
        """Return the cosine similarity between a query and each document column.

        Both inputs are normalised to unit length along axis 0 before the dot
        product is taken. A vector with zero norm (for example a query made up
        only of unseen words) has no direction; it is left as all zeros so its
        similarity is 0 rather than NaN from a division by zero.

        Args:
            q: query vector, shape ``[n_vocab, 1]``.
            _tf_idf: document matrix, shape ``[n_vocab, n_doc]``.

        Returns:
            1-D array with one similarity score per document column.
        """
        q_norm = np.sqrt(np.sum(np.square(q), axis=0, keepdims=True))
        ds_norm = np.sqrt(np.sum(np.square(_tf_idf), axis=0, keepdims=True))
        # Dividing an all-zero vector by 1 keeps it all-zero and avoids 0/0.
        unit_q = q / np.where(q_norm == 0, 1.0, q_norm)
        unit_ds = _tf_idf / np.where(ds_norm == 0, 1.0, ds_norm)
        similarity = unit_ds.T.dot(unit_q).ravel()
        return similarity
    
    
    def docs_score(q, len_norm=False):
        """Score every document against the query string ``q``.

        The query is tokenised like the corpus (commas removed, split on
        spaces). Its raw term counts are multiplied by the module-level
        ``idf`` and compared with the module-level ``tf_idf`` matrix via
        ``cosine_similarity``.

        Query words that are not in ``v2i`` get temporary extra rows (idf 0,
        tf-idf 0) that exist only for this call. The module-level ``v2i`` /
        ``i2v`` are left untouched, so repeated queries containing the same
        unseen word keep working and the vocabulary stays in step with
        ``idf`` / ``tf_idf``.

        Args:
            q: query text.
            len_norm: when true, divide each score by the token count of the
                corresponding document.

        Returns:
            1-D array with one score per document.
        """
        q_words = q.replace(",", "").split(" ")

        # add unknown words: give them row indices after the known vocabulary
        n_known = idf.shape[0]
        extra_v2i = {}
        for v in q_words:
            if v not in v2i and v not in extra_v2i:
                extra_v2i[v] = n_known + len(extra_v2i)
        unknown_v = len(extra_v2i)

        if unknown_v > 0:
            # np.float64 instead of the np.float alias, which newer NumPy
            # releases no longer provide.
            _idf = np.concatenate((idf, np.zeros((unknown_v, 1), dtype=np.float64)), axis=0)
            _tf_idf = np.concatenate((tf_idf, np.zeros((unknown_v, tf_idf.shape[1]), dtype=np.float64)), axis=0)
        else:
            _idf, _tf_idf = idf, tf_idf

        counter = Counter(q_words)
        q_tf = np.zeros((len(_idf), 1), dtype=np.float64)     # [n_vocab, 1]
        for v, count in counter.items():
            row = v2i[v] if v in v2i else extra_v2i[v]
            q_tf[row, 0] = count

        q_vec = q_tf * _idf            # [n_vocab, 1]
        print(q_vec.shape)
        print(_tf_idf.shape)

        q_scores = cosine_similarity(q_vec, _tf_idf)
        if len_norm:
            len_docs = [len(d) for d in docs_words]
            q_scores = q_scores / np.array(len_docs)
        print(q_scores.shape)
        return q_scores
    
    
    def get_keywords(n=2, n_docs=3):
        """Print the ``n`` highest tf-idf words for the first ``n_docs`` documents.

        Words are listed in ascending tf-idf order, so the strongest keyword
        comes last.

        Args:
            n: number of keywords to show per document; values <= 0 print an
                empty list.
            n_docs: how many leading documents to report. It is capped at the
                number of columns in ``tf_idf``, so a smaller corpus does not
                raise an IndexError.
        """
        for c in range(min(n_docs, tf_idf.shape[1])):
            col = tf_idf[:, c]
            # A bare [-n:] with n == 0 would select every word, not none.
            idx = np.argsort(col)[-n:] if n > 0 else []
            print("doc{}, top{} keywords {}".format(c, n, [i2v[i] for i in idx]))
    
    
    # Build the corpus-level matrices used by get_keywords() and docs_score().
    tf = get_tf()           # [n_vocab, n_doc]
    idf = get_idf()         # [n_vocab, 1]
    tf_idf = tf * idf       # [n_vocab, n_doc]
    # Optional debug output; the "\n" escapes are kept inside the string
    # literals so each statement stays on a single line.
    # print("tf shape(vecb in each docs): ", tf.shape)
    # print("\ntf samples:\n", tf[:2])
    # print("\nidf shape(vecb in all docs): ", idf.shape)
    # print("\nidf samples:\n", idf[:2])
    # print("\ntf_idf shape: ", tf_idf.shape)
    # print("\ntf_idf sample:\n", tf_idf[:2])


    # test
    get_keywords()
    q = "I get a coffee cup"
    scores = docs_score(q)
    print(scores)
    # argsort orders indices by ascending score: take the last three and
    # reverse them so the best match comes first.
    d_ids = scores.argsort()[-3:][::-1]
    print("\ntop 3 docs for '{}':\n{}".format(q, [docs[i] for i in d_ids]))
    

      用tf-idf算法找到与一个文档相似的其他文档。首先要统计出这些文档中出现的所有词,计算每一个文档中词的tf值,tf是用一个文档中出现词w的个数除以该文档的总词数,除以总词数是为了进行归一化处理。之后计算idf值,用文档的总数除以包含该词的文档数,最后对得到的商取对数,如果包含词的文档越少,idf值就越大,说明该词有很好的分辨能力。

  • 相关阅读:
    消息中间件(一)MQ详解及四大MQ比较
    SIP协议
    PAT (Basic Level) Practice 1008 数组元素循环右移问题
    LeetCode-Algorithms 1. 两数之和
    PAT (Basic Level) Practice 1040 有几个PAT
    PAT (Basic Level) Practice 1023 组个最小数
    PAT (Basic Level) Practice 1021 个位数统计
    PAT (Basic Level) Practice 1007 素数对猜想
    PAT (Basic Level) Practice 1006 换个格式输出整数
    PAT (Basic Level) Practice 1004 成绩排名
  • 原文地址:https://www.cnblogs.com/zhang12345/p/15322051.html
Copyright © 2011-2022 走看看