zoukankan      html  css  js  c++  java
  • 理解sklearn.feature_extraction.text中的CountVectorizer和TfidfVectorizer

    """
    理解sklearn中的CountVectorizer和TfidfVectorizer
    """
    from collections import Counter
    
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    
    # Toy corpus: two short documents that share the words "is" and "a".
    sentences = ["there is a dog dog", "here is a cat"]

    # Bag-of-words: each row is a document, each column a vocabulary word,
    # each cell the number of times that word occurs in that document.
    count_vec = CountVectorizer()
    a = count_vec.fit_transform(sentences)
    print(a.toarray())
    print(count_vec.vocabulary_)
    # Output of the vocabulary print:
    # {'dog': 1, 'there': 4, 'here': 2, 'cat': 0, 'is': 3}
    # i.e. every word is mapped to its column index in the matrix.
    # "a" is missing because the vectorizer's default token pattern only
    # keeps tokens of two or more characters.

    print("=" * 10)
    # Same corpus, but the counts are re-weighted by inverse document
    # frequency and every row is L2-normalised.
    tf_vec = TfidfVectorizer()
    b = tf_vec.fit_transform(sentences)
    print(b.toarray())
    print(tf_vec.vocabulary_)
    print(tf_vec.idf_)  # inverse document frequency of each vocabulary word
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old releases.
    _get_names = getattr(tf_vec, "get_feature_names_out", None) or tf_vec.get_feature_names
    # list(...) keeps the printed form a plain Python list, as before.
    print(list(_get_names()))
    
    
    def mytf_idf(s, words=None):
        """Hand-rolled TF-IDF that mirrors TfidfVectorizer's default settings.

        Documents are tokenised with ``str.split()``; only tokens that appear
        in ``words`` are counted.  The weighting is
        ``tf * (ln((n_docs + 1) / (df + 1)) + 1)`` followed by L2 row
        normalisation.

        Args:
            s: Sequence of documents (strings).
            words: Vocabulary, one word per output column.  When omitted, the
                feature names of the module-level ``tf_vec`` are used.

        Returns:
            A ``float32`` array of shape ``(len(s), len(words))``.  The same
            array is also printed.  Rows of documents that contain no
            vocabulary word are all zeros.
        """
        if words is None:
            # get_feature_names() was removed in scikit-learn 1.2; fall back
            # to it only when the newer accessor does not exist.
            get_names = getattr(tf_vec, "get_feature_names_out", None)
            if get_names is None:
                get_names = tf_vec.get_feature_names
            words = get_names()
        words = list(words)

        # Smoothing acts like one extra document containing every word, so
        # no document frequency is ever zero.
        smooth = 1

        # Term frequencies: raw count of each vocabulary word per document.
        tf_matrix = np.zeros((len(s), len(words)), dtype=np.float32)
        for i, sentence in enumerate(s):
            # Count the tokens once per document, not once per word.
            counts = Counter(sentence.split())
            for j, word in enumerate(words):
                tf_matrix[i, j] = counts[word]

        # Document frequency: number of documents containing each word,
        # plus the smoothing term.
        df_matrix = np.count_nonzero(tf_matrix, axis=0).astype(np.float32) + smooth

        # The trailing "+ 1" keeps every idf value >= 1, so words that occur
        # in all documents are not zeroed out.
        idf_matrix = np.log((len(s) + smooth) / df_matrix) + 1
        matrix = tf_matrix * idf_matrix

        # L2-normalise each row.  A row with no vocabulary word has norm 0;
        # dividing by 1 instead leaves it as zeros rather than NaN.
        norms = np.linalg.norm(matrix, 2, axis=1, keepdims=True)
        norms[norms == 0] = 1
        matrix = matrix / norms
        print(matrix)
        return matrix
    
    
    # Run the hand-written implementation on the same corpus so its output
    # can be compared with the TfidfVectorizer result printed earlier.
    print("=" * 10)
    mytf_idf(sentences)
    # TODO:
    # * IDF could be learned: train it through neural-network
    #   backpropagation instead of computing it directly.
    # * Sometimes CountVectorizer does not need the counts at all; knowing
    #   whether a word occurred is enough.
    
    
  • 相关阅读:
    常见模块和包
    二分查找算法
    常见内置函数
    Django总目录
    nginx配置站点
    Arduino语言
    Python连接Arduino的方法
    机器人学习
    Redis
    arduino总目录
  • 原文地址:https://www.cnblogs.com/weiyinfu/p/9558755.html
Copyright © 2011-2022 走看看