  • TF-IDF implemented in Java

    An outstanding issue: the library used for word segmentation is still problematic.

    Reference:

    http://www.cnblogs.com/ywl925/archive/2013/08/26/3275878.html

    The code (the full project lives on my old computer running Linux):

    The class below does two things: (1) lists all file names under a directory, and (2) reads a given file.

    package com.bobo.paper.util;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.ArrayList;
    import java.util.List;
    
    public class FileUtil {
     
        public static ArrayList<String> FileList = new ArrayList<String>(); // shared accumulator of file paths, filled by readDirs
        /**
         * Lists all files under a directory and its subdirectories.
         * @param filepath the directory path
         * @return the list of absolute file names under that path and its subpaths
         * @throws FileNotFoundException
         * @throws IOException
         */
        public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException
        {
            try
            {
                File file = new File(filepath);
                if(!file.isDirectory())
                {
                    System.out.println("输入的不是目錄名称;");
                    System.out.println("filepath:" + file.getAbsolutePath());
                }
                else
                {
                    String[] flist = file.list();
                    for(int i = 0; i < flist.length; i++)
                    {
                        File newfile = new File(filepath + "/" + flist[i]);
                        if(!newfile.isDirectory())
                        {
                            FileList.add(newfile.getAbsolutePath());
                        }
                        else if(newfile.isDirectory()) //if file is a directory, call ReadDirs
                        {
                            readDirs(filepath + "/" + flist[i]);
                        }                    
                    }
                }
            }catch(FileNotFoundException e)
            {
                System.out.println(e.getMessage());
            }
            return FileList;
        }
        /**
         * Reads a file and returns its content as a single string.
         * @param file the name of the file to read
         * @return the file content, with lines separated by "\n"
         * @throws FileNotFoundException
         * @throws IOException
         */
        public static String readFile(String file) throws FileNotFoundException, IOException
        {
            StringBuffer strSb = new StringBuffer(); // String is immutable; StringBuffer can grow
            InputStreamReader inStrR = new InputStreamReader(new FileInputStream(file), "gbk"); // byte stream to character stream
            BufferedReader br = new BufferedReader(inStrR);
            String line = br.readLine();
            while (line != null) {
                strSb.append(line).append("\n");
                line = br.readLine();
            }
            br.close();
            return strSb.toString();
        }
        
        
    
    }
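
    A minimal sketch of how FileUtil can be exercised on its own. The demo class and the directory path are placeholders, not part of the original project:

    package com.bobo.paper.util;

    import java.io.IOException;
    import java.util.List;

    public class FileUtilDemo {

        public static void main(String[] args) throws IOException {
            // Placeholder directory; point it at any folder of GBK-encoded text files.
            List<String> files = FileUtil.readDirs("D:/testfiles");
            for (String f : files) {
                String content = FileUtil.readFile(f);
                // Print a short prefix of each file as a sanity check.
                System.out.println(f + " -> " + content.substring(0, Math.min(40, content.length())));
            }
        }
    }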

    The class below handles word segmentation. The original draft left it unimplemented (the org.wltea.analyzer.lucene.IKAnalyzer wrapper needs a further Lucene jar that was missing); the version here is a working sketch that uses IK Analyzer's own IKSegmenter API instead, assuming only the IK Analyzer 2012 jar is on the classpath.

    package com.bobo.paper.util;

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;

    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class CutWordsUtil {

        /**
         * Segments the content of a file into a list of words.
         * @param file the name of the file to segment
         * @return the list of segmented words
         * @throws IOException
         */
        public static ArrayList<String> cutWords(String file) throws IOException {

            ArrayList<String> words = new ArrayList<String>();
            String text = FileUtil.readFile(file);
            // Assumes the IK Analyzer 2012 jar is on the classpath. IKSegmenter is its core
            // segmentation API and needs no extra Lucene jars (the further dependency the
            // original draft noted was missing).
            IKSegmenter segmenter = new IKSegmenter(new StringReader(text), true); // true = "smart" mode
            Lexeme lexeme;
            while ((lexeme = segmenter.next()) != null) {
                words.add(lexeme.getLexemeText());
            }
            return words;
        }
    }
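
    To sanity-check the segmenter by itself (a sketch; the sample file path is a placeholder, and the IK Analyzer jar must be on the classpath):

    package com.bobo.paper.util;

    import java.io.IOException;
    import java.util.ArrayList;

    public class CutWordsDemo {

        public static void main(String[] args) throws IOException {
            // Placeholder path; any GBK-encoded Chinese text file will do.
            ArrayList<String> words = CutWordsUtil.cutWords("D:/testfiles/sample.txt");
            System.out.println(words);
        }
    }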

    The class below implements the TF-IDF algorithm.

    package com.bobo.paper.athology;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    
    import com.bobo.paper.util.CutWordsUtil;
    import com.bobo.paper.util.FileUtil;
    
    public class TfIdfAthology {
     
        /**
         * Counts the number of occurrences of each word in a word list.
         * @param cutwords the list of segmented words
         * @return a HashMap whose key is the word and whose value is its occurrence count
         */
        public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords){
            HashMap<String, Integer> resTF = new HashMap<String, Integer>();
            
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
            } else {
                resTF.put(word, resTF.get(word) + 1);
            }
            System.out.println(word); // debug output: one line per token
        }
            return resTF;
        }
        /**
         * Computes term frequency (TF): each word's count divided by the total number of words.
         * @param cutwords the list of segmented words
         * @return a map from word to its TF value
         */
        public static HashMap<String, Float> tf(ArrayList<String> cutwords){
            HashMap<String, Float> resTF = new HashMap<String, Float>();
            
            int wordLen = cutwords.size();
            HashMap<String, Integer> intTF = normalTF(cutwords); 
            
        Iterator<Map.Entry<String, Integer>> iter = intTF.entrySet().iterator(); // iterate over the raw counts
        while (iter.hasNext()) {
            Map.Entry<String, Integer> entry = iter.next();
            float tfValue = entry.getValue() / (float) wordLen;
            resTF.put(entry.getKey(), tfValue);
            System.out.println(entry.getKey() + " = " + tfValue);
        }
            return resTF;
        } 
    /**
     * Segments every file under a directory. Returns a HashMap<String, HashMap<String, Integer>>:
     * the outer key is the file name, the inner key is a word, and the value is that word's count.
     * @param dirc the directory name
     * @return per-file word counts
     * @throws IOException
     */
        public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException{
            HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String,Integer>>();
            List<String> filelist = FileUtil.readDirs(dirc);
        for (String file : filelist) {
            ArrayList<String> cutwords = CutWordsUtil.cutWords(file); // segment one file
            HashMap<String, Integer> dict = normalTF(cutwords);
            allNormalTF.put(file, dict);
        }
            return allNormalTF;
        }
    /**
     * Computes the TF of every word in every file under a directory.
     * @param dirc the directory name
     * @return a HashMap<String, HashMap<String, Float>>: the first key is the file name,
     *         the second key is the word, and the value is that word's frequency in that file
     * @throws IOException
     */
        public static HashMap<String,HashMap<String, Float>> tfAllFiles(String dirc) throws IOException{
            HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
            List<String> filelist = FileUtil.readDirs(dirc);
            
        for (String file : filelist) {
            ArrayList<String> cutwords = CutWordsUtil.cutWords(file); // segment one file
            HashMap<String, Float> dict = tf(cutwords);
            allTF.put(file, dict);
        }
            return allTF;
        }
    /**
     * Computes each word's IDF value as log(|D| / df), where |D| is the number of documents
     * and df is the number of documents containing the word. (A common smoothed variant is
     * log(|D| / (1 + df)), which avoids dividing by zero for out-of-corpus words; here every
     * word comes from the corpus, so df >= 1 and the unsmoothed form is safe.)
     * @param all_tf HashMap<String, HashMap<String, Float>>: the first key is the file name,
     *               the second key is the word, and the value is that word's TF in that file
     * @return a map from word to IDF value
     */
        public static HashMap<String, Float> idf(HashMap<String,HashMap<String, Float>> all_tf){
            HashMap<String, Float> resIdf = new HashMap<String, Float>();
        // dict maps each word to the number of documents that contain it
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        int docNum = FileUtil.FileList.size();
        // loop over all files
        for (int i = 0; i < docNum; i++) {
            // all_tf maps file name -> (word -> TF)
            HashMap<String, Float> temp = all_tf.get(FileUtil.FileList.get(i));
            Iterator<Map.Entry<String, Float>> iter = temp.entrySet().iterator();

            while (iter.hasNext()) {
                // walk over every word that occurs in this file
                Map.Entry<String, Float> entry = iter.next();
                String word = entry.getKey();
                // IDF formula: idf_i = log(|D| / |{j : t_i in d_j}|), where |D| is the number of
                // documents in the corpus and |{j : t_i in d_j}| is the number of documents
                // containing t_i; for a word absent from the corpus that count would be zero,
                // which is why 1 + |{j : t_i in d_j}| is generally used in the denominator.
                    if(dict.get(word) == null){
                        dict.put(word, 1);
                    }else {
                        dict.put(word, dict.get(word) + 1);
                    }
                }
            }
            System.out.println("IDF for every word is:");
        Iterator<Map.Entry<String, Integer>> iterDict = dict.entrySet().iterator();
        while (iterDict.hasNext()) {
            Map.Entry<String, Integer> entry = iterDict.next();
            float value = (float) Math.log(docNum / (float) entry.getValue());
            resIdf.put(entry.getKey(), value);
            System.out.println(entry.getKey() + " = " + value);
        }
            return resIdf;
        }
    /**
     * Computes the TF-IDF value of every word in every file and prints the result.
     * @param all_tf map of all TF values: the first key is the file name, the second key is the word
     * @param idfs map of all IDF values, keyed by word
     */
        public static void tf_idf(HashMap<String,HashMap<String, Float>> all_tf,HashMap<String, Float> idfs){
            HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
            int docNum = FileUtil.FileList.size();
            for(int i = 0; i < docNum; i++){
                String filepath = FileUtil.FileList.get(i);
                HashMap<String, Float> tfidf = new HashMap<String, Float>();
                HashMap<String, Float> temp = all_tf.get(filepath);
            Iterator<Map.Entry<String, Float>> iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry<String, Float> entry = iter.next();
                String word = entry.getKey();
                Float value = entry.getValue() * idfs.get(word); // tf * idf
                tfidf.put(word, value);
            }
                resTfIdf.put(filepath, tfidf);
            }
            System.out.println("TF-IDF for Every file is :");
            DisTfIdf(resTfIdf);
        }
    // Prints the final TF-IDF values for every file.
        public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf){
        Iterator<Map.Entry<String, HashMap<String, Float>>> iter1 = tfidf.entrySet().iterator();
        while (iter1.hasNext()) {
            Map.Entry<String, HashMap<String, Float>> entrys = iter1.next();
            System.out.println("FileName: " + entrys.getKey());
            System.out.print("{");
            HashMap<String, Float> temp = entrys.getValue();
            Iterator<Map.Entry<String, Float>> iter2 = temp.entrySet().iterator();
            while (iter2.hasNext()) {
                Map.Entry<String, Float> entry = iter2.next();
                System.out.print(entry.getKey() + " = " + entry.getValue() + ", ");
            }
            System.out.println("}");
        }
            
        }
    }
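
    As a worked example of what this class computes: with a corpus of |D| = 3 documents, a word occurring 2 times in a 10-word document has tf = 2/10 = 0.2; if it appears in 2 of the 3 documents, idf = log(3/2) ≈ 0.405 (natural log, matching Math.log above), so its tf-idf for that document is 0.2 × 0.405 ≈ 0.081.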

    Finally, the entry point:

    package com.bobo.paper;
    
    import java.io.IOException;
    import java.util.HashMap;
    
    import com.bobo.paper.athology.TfIdfAthology;
    
    public class Welcome {
    
        /**
         * @param args
         */
    public static void main(String[] args) {

        String file = "D:/testfiles";

        HashMap<String, HashMap<String, Float>> all_tf;
        try {
            all_tf = TfIdfAthology.tfAllFiles(file);
            System.out.println();
            HashMap<String, Float> idfs = TfIdfAthology.idf(all_tf);
            System.out.println();
            TfIdfAthology.tf_idf(all_tf, idfs);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    
    }
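
    Run against a directory of GBK-encoded text files, the program prints the per-file TF values as they are computed, then the IDF of every word (after "IDF for every word is:"), and finally each file's TF-IDF map (after "TF-IDF for Every file is :").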
  • Original post: https://www.cnblogs.com/bobodeboke/p/3493035.html