TFIDF手写

#coding=utf-8
import numpy as np
import jieba

class TfIdf:
    """Hand-rolled TF-IDF over a small corpus of raw sentences.

    Construction tokenizes every sentence with ``jieba`` and builds a sorted
    vocabulary with stop words removed.  Calling :meth:`cal_tfidf` then fills
    in ``tf``, ``idf`` and ``tfidf`` as NumPy arrays whose columns follow the
    order of ``dic``.

    Attributes set by the methods:
        doc:   list of token lists (the raw strings are replaced in place).
        dic:   sorted list of distinct non-stop-word tokens.
        tf:    array, one row per document and one column per vocabulary word.
        idf:   array, one entry per vocabulary word.
        tfidf: ``tf * idf`` (broadcast across rows).
    """

    # Default stop-word file, one word per line.  Kept as a class attribute so
    # that code paths which never pass ``stop_path`` still resolve to it.
    stop_path = 'stop_word.txt'

    def __init__(self, doc, stop_path=None):
        """Store the corpus and immediately build the vocabulary.

        Args:
            doc: iterable of raw sentence strings.
            stop_path: optional path to a UTF-8 stop-word file; when omitted
                the class default ``'stop_word.txt'`` is used.

        Raises:
            OSError: if the stop-word file cannot be opened.
        """
        self.doc = doc
        if stop_path is not None:
            self.stop_path = stop_path
        self.get_dic()

    def get_dic(self):
        """Tokenize ``self.doc`` and build the sorted vocabulary ``self.dic``."""
        with open(self.stop_path, encoding="utf-8") as f:
            stop_dic = set(f.read().split("\n"))
        # From here on ``self.doc`` holds token lists instead of raw strings.
        self.doc = [list(jieba.cut(sent)) for sent in self.doc]
        # Stop words are dropped from the vocabulary only; they remain in the
        # token lists and therefore still count toward each document's length.
        self.dic = sorted({word for sent in self.doc for word in sent if word not in stop_dic})

    def cal_tf(self):
        """Compute term frequencies: count(word) / len(document), rounded to 4 places.

        A document with no tokens gets an all-zero row instead of raising
        ``ZeroDivisionError``.
        """
        rows = []
        for sent in self.doc:
            total = len(sent)
            if total:
                rows.append([round(sent.count(word) / total, 4) for word in self.dic])
            else:
                rows.append([0.0 for _ in self.dic])
        self.tf = np.array(rows)

    def cal_idf(self):
        """Compute inverse document frequencies: ln(N / df), rounded to 4 places.

        ``N`` is the number of documents and ``df`` the number of documents
        containing the word.  No smoothing is applied.
        """
        n_docs = len(self.doc)
        # Build each document's token set once so membership tests are O(1).
        doc_vocab = [set(sent) for sent in self.doc]
        self.idf = np.array([
            round(np.log(n_docs / sum(1 for vocab in doc_vocab if word in vocab)), 4)
            for word in self.dic
        ])

    def cal_tfidf(self):
        """Compute ``tf``, ``idf`` and their element-wise product ``tfidf``."""
        self.cal_tf()
        self.cal_idf()
        self.tfidf = self.tf * self.idf

if __name__ == "__main__":
    # Three short sample sentences used as the demo corpus.
    corpus = [
        '女排北京奥运会夺冠',
        '北京奥运会的羽毛球男单决赛',
        '中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排',
    ]
    # Build the vocabulary, compute the weights, then print the matrix
    # (one row per sentence, one column per vocabulary word).
    model = TfIdf(corpus)
    model.cal_tfidf()
    print(model.tfidf)

查看全文

相关阅读:
iOS开发CoreAnimation解读之三——几种常用Layer的使用解析
 iOS开发CoreAnimation解读之二——对CALayer的分析
 iOS开发CoreAnimation解读之一——初识CoreAnimation核心动画编程
 在最完整的搜索提示降史上的用户交互的研究——阅读《An Eye-tracking Study of User Interactions with Query Auto Completion》
学习算法
 This Android SDK requires Android Developer Toolkit version 22.6.2 or above.
一切都不是为了营销手段的目的都是耍流氓
 LeetCode219:Contains Duplicate II
无尽的循环ViewPager
允许Ubuntu14.04"保存"屏幕亮度值

原文地址：https://www.cnblogs.com/xiaoruirui/p/15625061.html