class TFIDF(object):
    """Compute a dense TF-IDF matrix for a small tokenised corpus.

    Using a library as an analogy, where every document is one book:

    * tf: how often the word occurs in a given book, divided by the number
      of (non stop-word) tokens in that book.
    * idf: ``1 + log((number of books + smooth_value)
      / (number of books containing the word + smooth_value))``.
    * tfidf: ``tf * idf``, i.e. the score of that word for that book.

    Attributes:
        corpus: The documents as passed in (never modified).
        stop_words: Container of tokens to drop before counting.
        word_sep: Separator used to split documents given as strings.
        smooth_value: Smoothing term added to both idf counts.
        vob: Maps ``f'{doc_index}_{word}'`` to the word's count in that
            document. Rebuilt by every ``get_tf_idf`` call.
        doc_cnt: Maps each word to the set of document indices containing
            it. Rebuilt by every ``get_tf_idf`` call.
        word_unq: Set of all distinct words kept after stop-word removal.
        vocabulary: Words in column order of the last matrix returned by
            ``get_tf_idf`` (order of first appearance in the corpus).
    """

    def __init__(self, corpus_, stop_words, word_sep=' ', smooth_value=0.01):
        """Store the corpus and configuration; no counting happens here.

        Args:
            corpus_: List of documents. Each document is either an iterable
                of tokens or a string that is split on ``word_sep``.
            stop_words: Container of tokens to ignore.
            word_sep: Separator for string documents.
            smooth_value: Smoothing term used in the idf formula.

        Raises:
            AssertionError: If ``corpus_`` is not a list.
        """
        # Raised explicitly (instead of via an `assert` statement) so the
        # check still runs under `python -O`, while callers keep seeing the
        # same exception type and message.
        if not isinstance(corpus_, list):
            raise AssertionError('Not support this type corpus.')
        self.corpus = corpus_
        self.vob = defaultdict(int)
        self.word_sep = word_sep
        self.smooth_value = smooth_value
        self.doc_cnt = defaultdict(set)
        self.word_unq = set()
        self.stop_words = stop_words
        self.vocabulary = []

    def get_tf_idf(self):
        """Return the TF-IDF matrix of the corpus.

        Returns:
            A ``numpy`` float array of shape
            ``(len(self.corpus), len(self.vocabulary))``. Entry ``[d, c]``
            is the TF-IDF score of ``self.vocabulary[c]`` in document ``d``
            and is ``0`` when the word does not occur in that document.
        """
        # Start from clean statistics: without this, calling the method a
        # second time would add to the existing counts and inflate every tf.
        self.vob = defaultdict(int)
        self.doc_cnt = defaultdict(set)
        self.word_unq = set()

        # word -> column index, assigned in order of first appearance so the
        # column layout does not depend on set/hash iteration order.
        column_of = {}
        filtered_docs = []
        for doc_idx, doc in enumerate(self.corpus):
            if isinstance(doc, str):
                doc = doc.split(self.word_sep)
            tokens = [tok for tok in doc if tok not in self.stop_words]
            filtered_docs.append(tokens)
            for word in tokens:
                self.vob[f'{doc_idx}_{word}'] += 1
                self.doc_cnt[word].add(doc_idx)
                self.word_unq.add(word)
                column_of.setdefault(word, len(column_of))
        self.vocabulary = list(column_of)

        n_docs = len(self.corpus)
        output = np.zeros((n_docs, len(column_of)))
        for doc_idx, tokens in enumerate(filtered_docs):
            doc_len = len(tokens)
            # Only words present in the document get a non-zero score, so
            # visit just those instead of scanning the whole vocabulary.
            # An empty document yields no iterations (no division by zero).
            for word in set(tokens):
                tf = self.vob[f'{doc_idx}_{word}'] / doc_len
                idf = 1 + np.log(
                    (n_docs + self.smooth_value)
                    / (self.smooth_value + len(self.doc_cnt[word]))
                )
                output[doc_idx, column_of[word]] = tf * idf
        return output


if __name__ == '__main__':
    # Each inner list plays the role of one book.
    corpus = [
        ['this', 'is', 'a', 'simple', 'tfidf', 'code', 'but', 'code',
         'might', 'has', 'bugs'],
        ['python', 'is', 'a', 'code', 'language', 'not', 'human',
         'language'],
        ['learning', 'python', 'make', 'things', 'simple', 'but', 'not',
         'simple', 'enough'],
    ]
    result = TFIDF(corpus, stop_words=['a'], smooth_value=1)
    print(result.get_tf_idf())