#coding=utf-8
"""TF-IDF weights for a small corpus of Chinese sentences tokenised with jieba."""
from collections import Counter

import numpy as np
import jieba


class TfIdf:
    """Compute TF, IDF and TF-IDF matrices for a list of raw sentences.

    After construction:
        doc -- list of token lists, one per input sentence (the raw strings
               passed in are replaced by their jieba tokenisation).
        dic -- sorted list of distinct tokens that are not stop words.

    After ``cal_tfidf()``:
        tf    -- array with one row per document and one column per ``dic`` word.
        idf   -- array with one entry per ``dic`` word.
        tfidf -- ``tf * idf`` (idf is broadcast across the rows of tf).
    """

    def __init__(self, doc, stop_path='stop_word.txt'):
        """Tokenise *doc* and build the vocabulary.

        Args:
            doc: iterable of raw sentence strings.
            stop_path: path of a UTF-8 text file with one stop word per line.

        Raises:
            OSError: if the stop-word file cannot be opened.
        """
        self.doc = doc
        self.stop_path = stop_path
        self.get_dic()

    def get_dic(self):
        """Load the stop words, tokenise every sentence and build ``self.dic``."""
        with open(self.stop_path, encoding="utf-8") as f:
            stop_dic = set(f.read().split("\n"))
        self.doc = [list(jieba.cut(sent)) for sent in self.doc]
        self.dic = sorted(
            {word for sent in self.doc for word in sent if word not in stop_dic}
        )

    def cal_tf(self):
        """Fill ``self.tf`` with per-document term frequencies.

        Each entry is ``count(word in document) / len(document)`` rounded to
        four decimals. The denominator is the full token count of the
        document, stop words included. A document with no tokens gets an
        all-zero row instead of raising ZeroDivisionError.
        """
        rows = []
        for sent in self.doc:
            total = len(sent)
            # One counting pass per document instead of one list.count() per word.
            counts = Counter(sent)
            if total:
                rows.append([round(counts[word] / total, 4) for word in self.dic])
            else:
                rows.append([0.0 for _ in self.dic])
        self.tf = np.array(rows)

    def cal_idf(self):
        """Fill ``self.idf`` with ``log(N / df)`` rounded to four decimals.

        ``N`` is the number of documents and ``df`` the number of documents
        that contain the word. Every word in ``self.dic`` was collected from
        some document, so ``df`` is at least 1.
        """
        n_docs = len(self.doc)
        doc_freq = Counter()
        for sent in self.doc:
            # set() so that each document contributes at most once per word.
            doc_freq.update(set(sent))
        self.idf = np.array(
            [round(np.log(n_docs / doc_freq[word]), 4) for word in self.dic]
        )

    def cal_tfidf(self):
        """Compute ``tf`` and ``idf``, then store their product in ``self.tfidf``."""
        self.cal_tf()
        self.cal_idf()
        self.tfidf = self.tf * self.idf


if __name__ == "__main__":
    doc = ['女排北京奥运会夺冠',
           '北京奥运会的羽毛球男单决赛',
           '中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排']
    tf_idf = TfIdf(doc)
    tf_idf.cal_tfidf()
    print(tf_idf.tfidf)