zoukankan      html  css  js  c++  java
  • tfidf代码简单实现

    class TFIDF(object):
        """Minimal TF-IDF vectorizer over an in-memory corpus.

        Using a library as the analogy (each document is a "book"):

        * tf:  frequency of a word inside one book, i.e.
               (occurrences of the word in the book) / (words kept in the book).
        * idf: 1 + log((number_of_books + smooth) / (books_containing_word + smooth)).
        * tfidf = tf * idf, the weight of that word for that book.

        After ``get_tf_idf()`` has run, ``vocabulary[j]`` is the word that
        column ``j`` of the returned matrix refers to.
        """

        def __init__(self, corpus_,  stop_words, word_sep=' ', smooth_value=0.01):
            """Store the corpus and the configuration.

            Args:
                corpus_: list of documents; each document is either a list of
                    tokens or a string that is split on ``word_sep``.
                stop_words: container of tokens to drop before counting.
                word_sep: separator used to split string documents.
                smooth_value: value added to both numerator and denominator
                    of the idf ratio.
            """
            assert isinstance(corpus_, list), 'Not support this type corpus.'
            self.corpus = corpus_
            self.vob = defaultdict(int)       # '<doc index>_<word>' -> count in that doc
            self.word_sep = word_sep
            self.smooth_value = smooth_value
            self.doc_cnt = defaultdict(set)   # word -> set of doc indices containing it
            self.word_unq = set()             # every distinct word kept
            self.stop_words = stop_words
            self.vocabulary = []              # column index -> word, filled by get_tf_idf

        def get_tf_idf(self):
            """Compute the dense TF-IDF matrix of the corpus.

            Returns:
                A ``numpy`` array of shape ``(len(corpus), len(vocabulary))``.
                Columns follow the order in which words are first seen in the
                corpus, so the layout is deterministic and is described by
                ``self.vocabulary``. Documents that are empty after stop-word
                filtering yield an all-zero row.
            """
            # Start from clean statistics so that calling this method more than
            # once does not accumulate counts from the previous call.
            self.vob = defaultdict(int)
            self.doc_cnt = defaultdict(set)
            self.word_unq = set()

            column_of = {}      # word -> column index, in first-seen order
            filter_corpus = []
            for doc_id, line in enumerate(self.corpus):
                if isinstance(line, str):
                    line = line.split(self.word_sep)
                tokens = [tok for tok in line if tok not in self.stop_words]
                filter_corpus.append(tokens)
                for w in tokens:
                    self.vob[f'{doc_id}_{w}'] += 1
                    self.doc_cnt[w].add(doc_id)
                    self.word_unq.add(w)
                    column_of.setdefault(w, len(column_of))
            self.vocabulary = list(column_of)

            n_docs = len(self.corpus)
            output = np.zeros((n_docs, len(column_of)))
            for doc_id, tokens in enumerate(filter_corpus):
                doc_len = len(tokens)
                # Only words present in the document can be non-zero, so visit
                # those instead of scanning the whole vocabulary.
                for w in set(tokens):
                    idf = 1 + np.log((n_docs + self.smooth_value)
                                     / (self.smooth_value + len(self.doc_cnt[w])))
                    output[doc_id, column_of[w]] = self.vob[f'{doc_id}_{w}'] / doc_len * idf
            return output
    
    
    if __name__ == '__main__':
        # Demo: each inner list plays the role of one book in the library.
        book_one = ['this', 'is', 'a', 'simple', 'tfidf', 'code', 'but', 'code', 'might', 'has', 'bugs']
        book_two = ['python', 'is', 'a', 'code', 'language', 'not', 'human', 'language']
        book_three = ['learning', 'python', 'make', 'things', 'simple', 'but', 'not', 'simple', 'enough']
        corpus = [book_one, book_two, book_three]

        # Drop the stop word 'a' and use a smoothing value of 1 for the idf.
        result = TFIDF(corpus, stop_words=['a'], smooth_value=1)
        tfidf_matrix = result.get_tf_idf()
        print(tfidf_matrix)
    
  • 相关阅读:
    密码加盐
    怎么查看别人网站信息
    Tomcat虚拟路径访问本地图片失败的问题
    SSM整合Swagger
    Can't read swagger JSON from http://localhost:8080/Test/api-docs
    swagger使用一新手篇
    @JsonFormat的导包问题
    Could not find acceptable representation
    PostMan打不开怎么解决
    Project facet Java version 1.8 not supported
  • 原文地址:https://www.cnblogs.com/laresh/p/12440051.html
Copyright © 2011-2022 走看看