zoukankan      html  css  js  c++  java
  • TFIDF代码实现

     1 #!/usr/bin/env python 
     2 # encoding: utf-8 
     3 
     4 """
     5 @author: zkjiang
     6 @site: https://www.github.com
     7 @software: PyCharm
     8 @file: TFIDF.py
     9 @time: 2019/2/2 12:33
    10 """
    11 
    12 import numpy as np
    13 
    14 class TFIDF(object):
    15 
    16     """
    17     手写一个TFIDF统计类,只写最简单的一个实现
    18     """
    19 
    20     def __init__(self, corpus):
    21         """
    22         初始化
    23         self.vob:词汇个数统计,dict格式
    24         self.word_id:词汇编码id,dict格式
    25         self.smooth_idf:平滑系数,关于平滑不多解释了
    26         :param corpus:输入的语料
    27         """
    28         self.word_id = {}
    29         self.vob = {}
    30         self.corpus = corpus
    31         self.smooth_idf = 0.01
    32 
    33     def fit_transform(self, corpus):
    34         pass
    35 
    36     def get_vob_fre(self):
    37         """
    38         计算文本特特征的出现次数,也就是文本频率term frequency,但是没有除token总数,因为后面bincount计算不支持float
    39         :return: 修改self.vob也就是修改词频统计字典
    40         """
    41         # 统计各词出现个数
    42         id = 0
    43         for single_corpus in self.corpus:
    44             if isinstance(single_corpus, list):
    45                 pass
    46             if isinstance(single_corpus, str):
    47                 single_corpus = single_corpus.strip("
    ").split(" ")
    48             for word in single_corpus:
    49                 if word not in self.vob:
    50                     self.vob[word] = 1
    51                     self.word_id[word] = id
    52                     id += 1
    53                 else:
    54                     self.vob[word] += 1
    55 
    56         # 生成矩阵
    57         X = np.zeros((len(self.corpus), len(self.vob)))
    58         for i in range(len(self.corpus)):
    59             if isinstance(self.corpus[i], str):
    60                 single_corpus = self.corpus[i].strip("
    ").split(" ")
    61             else:
    62                 single_corpus = self.corpus[i]
    63             for j in range(len(single_corpus)):
    64                 feature = single_corpus[j]
    65                 feature_id = self.word_id[feature]
    66                 X[i, feature_id] = self.vob[feature]
    67         return X.astype(int)  # 需要转化成int
    68 
    69 
    70     def get_tf_idf(self):
    71         """
    72         计算idf并生成最后的TFIDF矩阵
    73         :return:
    74         """
    75         X = self.get_vob_fre()
    76         n_samples, n_features = X.shape
    77         df = []
    78         for i in range(n_features):
    79             """
    80             这里是统计每个特征的非0的数量,也就是逆文档频率指数的分式中的分母,是为了计算idf
    81             """
    82             df.append(n_samples - np.bincount(X[:,i])[0])
    83         df = np.array(df)
    84         # perform idf smoothing if required
    85         df += int(self.smooth_idf)
    86         n_samples += int(self.smooth_idf)
    87         idf = np.log(n_samples / df) + 1  # 核心公式
    88         # print(self.vob)
    89         # print(self.word_id)
    90         return X*idf/len(self.vob)
    91 
    92 
    93 
    94 if __name__ == '__main__':
    95     corpus = [["","a","e"],["","a","c"],["","a","b"]]
    96     test = TFIDF(corpus)
    97     # print(test.get_vob_fre())
    98     print(test.get_tf_idf())
  • 相关阅读:
    java8知识总结_2.方法引用
    Shell三剑客_1.grep
    java8知识总结_1.Lambda表达式
    javascript中的设计模式
    javascript入门学习
    css3新特性
    Html5新增了什么
    什么是Node.js
    vue项目搭建
    Git使用
  • 原文地址:https://www.cnblogs.com/smartisn/p/12459801.html
Copyright © 2011-2022 走看看