理解sklearn.feature_extraction.text中的CountVectorizer和TfidfVectorizer

"""
理解sklearn中的CountVectorizer和TfidfVectorizer
"""
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Toy corpus shared by the demos in this script.
sentences = ["there is a dog dog", "here is a cat"]

# Learn the vocabulary and build the sparse document-term count matrix.
count_vec = CountVectorizer()
a = count_vec.fit_transform(sentences)

# Dense view of the counts: one row per sentence, one column per term.
print(a.toarray())

# vocabulary_ maps every term to its column index in the matrix.
# Output for this corpus:
#   {'dog': 1, 'there': 4, 'here': 2, 'cat': 0, 'is': 3}
# Note that "a" is absent from the vocabulary (scikit-learn's default token
# pattern only keeps tokens of two or more characters).
print(count_vec.vocabulary_)

# Same corpus, this time weighted with TF-IDF instead of raw counts.
print("=" * 10)
tf_vec = TfidfVectorizer()
b = tf_vec.fit_transform(sentences)
print(b.toarray())
print(tf_vec.vocabulary_)
print(tf_vec.idf_)  # inverse document frequency of each vocabulary term
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# .tolist() keeps the printed form a plain Python list, as before.
print(
    tf_vec.get_feature_names_out().tolist()
    if hasattr(tf_vec, "get_feature_names_out")
    else tf_vec.get_feature_names()
)


def mytf_idf(s, words=None):
    """Compute, print and return an L2-normalised TF-IDF matrix by hand.

    Each sentence is split on whitespace and the occurrences of every
    vocabulary word are counted (term frequency). The inverse document
    frequency uses add-one smoothing:
    ``idf = ln((n_docs + 1) / (doc_freq + 1)) + 1``, so it is never below 1.
    Every row of ``tf * idf`` is then scaled to unit Euclidean length.

    Args:
        s: Sequence of sentences (strings).
        words: Optional ordered vocabulary, one entry per output column.
            When omitted, the feature names of the module-level ``tf_vec``
            vectorizer are used, which is what the original version did.

    Returns:
        A ``float32`` array of shape ``(len(s), len(words))``. Rows that
        contain none of the vocabulary words are left as all zeros.
    """
    if words is None:
        # get_feature_names() was removed in scikit-learn 1.2; use the
        # replacement when available and fall back on old installations.
        getter = getattr(tf_vec, "get_feature_names_out", None) or tf_vec.get_feature_names
        words = getter()
    words = list(words)

    smooth = 1
    tf_matrix = np.zeros((len(s), len(words)), dtype=np.float32)
    # Document frequencies start at the smoothing term instead of zero.
    df_matrix = np.ones(len(words), dtype=np.float32) * smooth
    for i, sentence in enumerate(s):
        # Count the tokens once per sentence rather than once per word.
        counts = Counter(sentence.split())
        for j, word in enumerate(words):
            cnt = counts.get(word, 0)
            tf_matrix[i][j] = cnt
            if cnt > 0:
                df_matrix[j] += 1

    # The trailing "+ 1" keeps the idf at 1 or above for every term.
    idf_matrix = np.log((len(s) + smooth) / df_matrix) + 1
    matrix = tf_matrix * idf_matrix

    norms = np.linalg.norm(matrix, 2, axis=1).reshape(matrix.shape[0], 1)
    # An all-zero row has norm 0; divide by 1 instead so it stays zero
    # rather than turning into NaN.
    norms[norms == 0] = 1
    matrix = matrix / norms
    print(matrix)
    return matrix


# Print a separator, then run the hand-written TF-IDF on the same corpus.
print("=" * 10)
mytf_idf(sentences)

# TODO:
# * The IDF could be learned, i.e. trained through neural-network
#   backpropagation instead of being computed directly.
# * CountVectorizer sometimes does not need the counts, only whether a
#   term occurred at all.

查看全文

相关阅读:
虚拟化（五）：vsphere高可用群集与容错（存储DRS是一种可用于将多个数据存储作为单个数据存储群集进行管理的功能）
vmware 桌面虚拟化 horizon view 介绍（使用微软的RDP协议或vmware 专有的PCoIP协议，连接到虚拟桌面，并且可以使用本地的USB设备、本地存储）
Delphi之萝莉调教篇
 编写自定义PE结构的程序(如何手写一个PE，高级编译器都是编译好的PE头部,例如MASM,TASM等，NASM,FASM是低级编译器.可以自定义结构)
localStore的storage事件
 对称密码体制和非对称密码体制
 Span<T>和ValueTuple<T>性能是.Net Core非常关键的特性
 分布式高并发下Actor模型
 公众号及H5支付
 BIOS（Basic Input/Output System）是基本输入输出系统的简称

原文地址：https://www.cnblogs.com/weiyinfu/p/9558755.html