zoukankan      html  css  js  c++  java
  • TFIDF手写

    #coding=utf-8
    from collections import Counter

    import jieba
    import numpy as np
    
    class TfIdf:
        """Hand-rolled TF-IDF over a small corpus of raw text documents.

        Construction tokenizes every document with ``jieba.cut`` and builds a
        sorted vocabulary (``self.dic``) with stop words removed. Call
        ``cal_tfidf()`` afterwards to populate ``self.tf``, ``self.idf`` and
        ``self.tfidf``.
        """

        def __init__(self, doc, stop_path='stop_word.txt'):
            """Store the corpus and build the vocabulary.

            Args:
                doc: list of raw document strings.
                stop_path: path of a UTF-8 text file with one stop word per
                    line. Defaults to the path the class always used.
            """
            self.doc = doc
            self.get_dic(stop_path)

        def get_dic(self, stop_path='stop_word.txt'):
            """Tokenize ``self.doc`` in place and build ``self.dic``.

            After this call ``self.doc`` is a list of token lists (one per
            document) and ``self.dic`` is the sorted list of distinct tokens
            that are not stop words.

            Raises:
                OSError: if the stop-word file cannot be opened.
            """
            with open(stop_path, encoding="utf-8") as f:
                stop_dic = set(f.read().split("\n"))
            self.doc = [list(jieba.cut(sent)) for sent in self.doc]
            self.dic = sorted({
                word
                for sent in self.doc
                for word in sent
                if word not in stop_dic
            })

        def cal_tf(self):
            """Compute ``self.tf``: one row per document, one column per word.

            Each entry is ``count(word in document) / len(document)`` rounded
            to 4 decimals. The denominator is the full token count of the
            document, stop words included.
            """
            rows = []
            for sent in self.doc:
                total = len(sent)
                if total == 0:
                    # A document with no tokens has no term frequencies;
                    # emit zeros instead of dividing by zero.
                    rows.append([0.0] * len(self.dic))
                    continue
                # Count every token once rather than rescanning the document
                # for each vocabulary word.
                counts = Counter(sent)
                rows.append([round(counts[word] / total, 4) for word in self.dic])
            self.tf = np.array(rows)

        def cal_idf(self):
            """Compute ``self.idf``: ``log(N / df(word))`` rounded to 4 decimals.

            ``N`` is the number of documents and ``df(word)`` the number of
            documents that contain the word. No smoothing is applied.
            """
            n_docs = len(self.doc)
            # Document frequency: each word counted at most once per document.
            doc_freq = Counter(word for sent in self.doc for word in set(sent))
            self.idf = np.array(
                [round(np.log(n_docs / doc_freq[word]), 4) for word in self.dic]
            )

        def cal_tfidf(self):
            """Compute TF and IDF, then store their product in ``self.tfidf``."""
            self.cal_tf()
            self.cal_idf()
            # idf has one entry per vocabulary word and is broadcast across
            # the document rows of tf.
            self.tfidf = self.tf * self.idf
    
    if __name__ == "__main__":
        # Demo: score three short Chinese headlines and print the resulting
        # document-by-vocabulary TF-IDF matrix.
        corpus = [
            '女排北京奥运会夺冠',
            '北京奥运会的羽毛球男单决赛',
            '中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排',
        ]
        model = TfIdf(corpus)
        model.cal_tfidf()
        print(model.tfidf)
  • 相关阅读:
    Bootstrap3.0学习第八轮
    内存管理相关的信息
    SVN merge
    Asp.Net MVC 3
    formValidator
    jquery 分页控件2
    从零开始学C++之STL(四):算法简介、7种算法分类
    (Java实现) 过河卒
    (Java实现) N皇后问题
    (Java实现) N皇后问题
  • 原文地址:https://www.cnblogs.com/xiaoruirui/p/15625061.html
Copyright © 2011-2022 走看看