  • TF-IDF

    Reference:
    http://www.ruanyifeng.com/blog/2013/03/tf-idf.html (a very clear write-up)
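
    In the formulation implemented below, tf(t, d) = count(t, d) / |d| and idf(t) = log(N / (df(t) + 1)), where N is the number of documents in the corpus and df(t) is the number of documents containing t. For example, a term that appears 3 times in a 100-word document has tf = 0.03; if it also occurs in 10 of 1,000 corpus documents, idf = ln(1000 / 11) ≈ 4.51, giving a weight of about 0.135.

    The TF_IDF class reads its corpus statistics from a file whose first line is N and whose remaining lines are "term : document_frequency" pairs, for example (made-up values):

    1000
    china : 120
    bee : 10
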
    package com.data.text.tfidf;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Map.Entry;
    
    public class TF_IDF {
    
        private double NUM_DOCS;

        private Map<String, Integer> idf_map;
        
        public TF_IDF(String fileName){
            idf_map = new HashMap<String, Integer>();
            File file = new File(fileName);
            BufferedReader reader = null;
            try {
                reader = new BufferedReader(new FileReader(file));
                String tempString = null;
    
                // the first line holds the total number of documents
                tempString = reader.readLine();
                NUM_DOCS = (double) Integer.parseInt(tempString);
                
                // read one line at a time; null signals end of file
                while ((tempString = reader.readLine()) != null) {
                    String[] arr = tempString.split(" : ");
                    String key = arr[0];
                    Integer value = Integer.parseInt(arr[1]);
                    idf_map.put(key, value);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e1) {
                    }
                }
            }
        }
        
        
    
        /**
         * Computes the tf-idf weight of every term in the given
         * term-frequency map and returns the terms sorted by weight.
         */
        public List<Feature> cacu(Map<String, Integer> tf_map) {
    
            // total number of word occurrences in the document
            int word_num_sum = 0;
            for (Entry<String, Integer> entry : tf_map.entrySet()) {
                word_num_sum += entry.getValue();
            }
            
            // compute tf-idf for each term
            List<Feature> list_fea = new ArrayList<Feature>();
            for (Entry<String, Integer> entry : tf_map.entrySet()) {
                String word = entry.getKey();
                Integer num = entry.getValue();
                double tf = (double) num / word_num_sum;
                // document frequency in the corpus; 0 for terms never seen
                Integer df = idf_map.get(word);
                if (df == null) {
                    df = 0;
                }
                // inverse document frequency with +1 smoothing: log(N / (df + 1))
                double idf = Math.log(NUM_DOCS / (df + 1));
                double weight = tf * idf;
                list_fea.add(new Feature(word, num, weight));
            }
            
            // sort by descending weight
            Collections.sort(list_fea);
    
            return list_fea;
        }
    
        public static void main(String[] args) {
            // usage sketch (the corpus file name is hypothetical):
            //   TF_IDF tfidf = new TF_IDF("corpus.txt");
            //   Map<String, Integer> tf_map = ...; // term -> count for one document
            //   List<Feature> features = tfidf.cacu(tf_map);
        }
    
    }
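
    cacu returns the document's terms ranked by tf-idf weight; each result is a Feature (defined below) carrying the word, its raw count, and its weight.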
    
    
    package com.data.text.tfidf;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Set;
    
    public class StopWord {
        
        public static Set<String> getStopWords() {
            String fileName = "stopwords.txt";
            return readwords(fileName);
        }
        
        /**
         * Reads the stop-word list, one word per line.
         * @param fileName path to the stop-word file
         * @return the set of stop words
         */
        private static Set<String> readwords(String fileName){
            Set<String> set = new HashSet<String>();
            File file = new File(fileName);
            BufferedReader reader = null;
            try {
                reader = new BufferedReader(new FileReader(file));
                String tempString = null;
                
                // read one line at a time; null signals end of file
                while ((tempString = reader.readLine()) != null) {
                    set.add(tempString.trim());
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e1) {
                    }
                }
            }
            return set;
        }
    }
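
    StopWord loads stopwords.txt, one word per line. Nothing in TF_IDF consults it, so the stop-word filtering is presumably applied upstream, when a document is tokenized into the term-frequency map passed to cacu.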
    
    
    package com.data.text.tfidf;
    
    /**
     * A feature term: the word, its in-document frequency, and its tf-idf weight.
     * @author root
     */
    public class Feature implements Comparable<Feature> {
        private String word;
        private Integer num;
        private double weight;
    
        public Feature(String word, Integer num, double weight) {
            this.word = word;
            this.num = num;
            this.weight = weight;
        }
    
        public String getWord() {
            return word;
        }
    
        public Integer getNum() {
            return num;
        }
    
        public double getWeight() {
            return weight;
        }
    
        @Override
        public int compareTo(Feature o) {
            // sort by descending weight
            return Double.compare(o.getWeight(), this.getWeight());
        }
        
        @Override
        public String toString() {
            return this.word + " freq: " + num + " weight: " + weight;
        }
    }
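
    A Python implementation of the same pipeline follows. Note two differences: its idf is log((1 + N) / (1 + df)), smoothing the numerator as well, and the corpus statistics can be grown incrementally with add_input_document.
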
    __author__ = 'dell'
    
    import math
    import re
    from operator import itemgetter
    
    
    class TfIdf:
        def __init__(self, corpus_filename=None, stopword_filename=None, DEFAULT_IDF=1.5):
            self.num_docs = 0
            self.term_num_docs = {}
            self.stopwords = set()
            self.idf_default = DEFAULT_IDF

            if corpus_filename:
                with open(corpus_filename, 'r') as corpus_file:
                    # the first line holds the total number of documents
                    self.num_docs = int(corpus_file.readline())
                    # each subsequent line is "term : document_frequency"
                    for line in corpus_file:
                        tokens = line.split(':')
                        term = tokens[0].strip()
                        frequency = int(tokens[1].strip())
                        self.term_num_docs[term] = frequency

            if stopword_filename:
                with open(stopword_filename) as stopword_file:
                    self.stopwords = set(line.strip() for line in stopword_file)
    
        def get_tokens(self, text):
            # tokens are <a>...</a> anchors kept whole, other HTML tags, and words
            return re.findall(r"<a.*?</a>|<[^>]*>|[\w'@#]+", text.lower())
    
        def add_input_document(self, doc):
            # each distinct term counts once per document
            self.num_docs += 1
            for word in set(self.get_tokens(doc)):
                self.term_num_docs[word] = self.term_num_docs.get(word, 0) + 1
    
        def get_num_docs(self):
            return self.num_docs
    
        def get_idf(self, term):
            if term in self.stopwords:
                return 0
            if term not in self.term_num_docs:
                return self.idf_default
            # add-one smoothing on both document count and document frequency
            return math.log(float(1 + self.get_num_docs()) / (1 + self.term_num_docs[term]))
    
        def get_doc_keywords(self, curr_doc):
            # tf-idf for every distinct token, sorted by descending weight
            tfidf = {}
            tokens = self.get_tokens(curr_doc)
            for word in set(tokens):
                # divide as floats so Python 2 does not truncate tf to 0
                tf = tokens.count(word) / float(len(tokens))
                idf = self.get_idf(word)
                tfidf[word] = tf * idf
            return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
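
    A minimal usage sketch (the file names and document text are hypothetical):

    tfidf = TfIdf(corpus_filename='corpus.txt', stopword_filename='stopwords.txt')
    tfidf.add_input_document('bees make honey in the hive')
    for term, weight in tfidf.get_doc_keywords('bees and honey and more bees')[:10]:
        print(term, weight)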
  • Original post: https://www.cnblogs.com/i80386/p/3240601.html