zoukankan      html  css  js  c++  java
  • 基于贝叶斯的垃圾邮件过滤器 JAVA

    <span style="font-size: 18px;">package cn.zhf.test;  
    
    import java.io.*;  
    import java.util.*;  
    
    public class SpamMailDetection {  
        public static final String BASE_PATH = "C:\Users\zhf\Desktop\mail";  
        public static final String SPAM_PATH = BASE_PATH + "\train_illegal.txt";//垃圾邮件语料  
        public static final String OK_PATH = BASE_PATH + "\train_legal.txt";//正常邮件语料  
        public static final String EMAIL_PATH = BASE_PATH + "\to_judge.txt";//要判别的邮件  
        public static final String DICT_PATH = BASE_PATH + "\dict.txt";//分词用的词典  
    
        public static void main(String[] args) {  
            SpamMailDetection smc = new SpamMailDetection();  
            //<word,(word/NonSpamCorpus)>  
            Map<String, Double> okmap = smc.createMailMap(OK_PATH);  
            //<word,(word/SpamCorpus)>  
            Map<String, Double> spammap = smc.createMailMap(SPAM_PATH);  
            Map<String, Double> ratemap = smc.createSpamProbabilityMap(spammap, okmap);  
            double probability = smc.judgeMail(EMAIL_PATH, ratemap);  
            if (probability > 0.5)//概率大于0.5则判定为垃圾  
                System.out.println("It's an ok mail.");  
            else  
                System.out.println("It's a spam mail.");  
    
        }  
    
        /** 
         * 给定邮件,分词,根据分词结果判断是垃圾邮件的概率  
         * P(Spam|t1,t2,t3……tn)=(P1*P2*……PN)/(P1*P2*……PN+(1-P1)*(1-P2)*……(1-PN)) 
         */  
        public double judgeMail(String emailPath, Map<String, Double> ratemap) {  
            List<String> list = segment(readFile(emailPath));  
            double rate = 1.0;  
            double tempRate = 1.0;  
            for (String str : list) {  
                if (ratemap.containsKey(str)) {  
                    double tmp = ratemap.get(str);  
                    tempRate *= 1 - tmp;  
                    rate *= tmp;  
                }  
            }  
            return rate / (rate + tempRate);  
        }  
    
        /** 
         * 从给定的垃圾邮件、正常邮件语料中建立map <切出来的词,出现的频率> 
         */  
        public Map<String, Double> createMailMap(String filePath) {  
            String str = readFile(filePath);  
            List<String> list = segment(str);  
            Map<String, Integer> tmpmap = new HashMap<String, Integer>();  
            Map<String, Double> retmap = new HashMap<String, Double>();  
            double rate = 0.0;  
            int count = 0;  
            for (String s : list) {  
                tmpmap.put(s, tmpmap.containsKey(s) ? count + 1 : 1);  
            }  
            for (Iterator iter = tmpmap.keySet().iterator(); iter.hasNext();) {  
                String key = (String) iter.next();  
                rate = tmpmap.get(key) / list.size();  
                retmap.put(key, rate);  
            }  
            return retmap;  
        }  
    
        /** 
         * 建立map,<str,rate> 邮件中出现ti时,该邮件为垃圾邮件的概率 
         * P( Spam|ti) =P2(ti )/((P1 (ti ) +P2 ( ti )) 
         */  
        public Map<String, Double> createSpamProbabilityMap(Map<String, Double> spammap,  
                Map<String, Double> okmap) {  
            Map<String, Double> retmap = new HashMap<String, Double>();  
            for (Iterator iter = spammap.keySet().iterator(); iter.hasNext();) {  
                String key = (String) iter.next();  
                double rate = spammap.get(key);  
                double allRate = rate;  
                if (okmap.containsKey(key)) {  
                    allRate += okmap.get(key);  
                }  
                retmap.put(key, rate / allRate);  
            }  
            return retmap;  
        }  
    
        /** 
         * 中文分词 
         */  
        public List<String> segment(String str) {  
            Map<String, Integer> map = loadDict();  
            List<String> list = new ArrayList<String>();  
            int len = str.length();  
            String term;  
            int maxSize = 6;  
            int i = 0, j = 0;  
            while (i < len) {  
                int n = i + maxSize < len ? i + maxSize : len + 1;  
                boolean findFlag = false;  
                for (j = n - 1; j > i; j--) {  
                    term = str.substring(i, j);  
                    if (map.containsKey(term)) {  
                        list.add(term);  
                        findFlag = true;  
                        i = j;  
                        break;  
                    }  
                }  
                if (findFlag == false)  
                    i = j + 1;  
            }  
            return list;  
        }  
    
        /** 
         * 加载词典文件 
         */  
        public Map<String, Integer> loadDict() {  
            Map<String, Integer> map = new HashMap<String, Integer>();  
            String[] str;  
            try {  
                BufferedReader br = new BufferedReader(new InputStreamReader(  
                        new FileInputStream(new File(DICT_PATH)), "gbk"));  
                String tmp = "";  
                while ((tmp = br.readLine()) != null) {  
                    str = tmp.split("	");  
                    map.put(str[0], 0);  
                }  
                br.close();  
            } catch (FileNotFoundException e) {  
                e.printStackTrace();  
            } catch (IOException e) {  
                e.printStackTrace();  
            }  
            return map;  
        }  
    
        /** 
         * 读文件 
         */  
        public String readFile(String filePath) {  
            String str = "";  
            try {  
                BufferedReader br = new BufferedReader(new InputStreamReader(  
                        new FileInputStream(new File(filePath)), "gbk"));  
                String tmp = "";  
                while ((tmp = br.readLine()) != null)  
                    str += tmp;  
                br.close();  
            } catch (FileNotFoundException e) {  
                e.printStackTrace();  
            } catch (IOException e) {  
                e.printStackTrace();  
            }  
            return str;  
        }  
    
    }  
    </span>  
  • 相关阅读:
    spring mvc---web.xml
    javascript:;与javascript:void(0)使用介绍
    JVM的内存区域划分
    获取配置文件内容
    spring获取webapplicationcontext,applicationcontext几种方法详解
    Spring MVC 中 HandlerInterceptorAdapter的使用(拦截器)
    google开发工具指南
    深克隆
    IO优化
    UML类图
  • 原文地址:https://www.cnblogs.com/fonttian/p/7294817.html
Copyright © 2011-2022 走看看