zoukankan      html  css  js  c++  java
  • 朴素贝叶斯新闻分类系统

    基于搜狗语料库,建立的一个新闻分类系统;类别包括:

    // Category index -> category label used by the classifier (nine news categories).
    classifierMap.put(0, "IT");
    classifierMap.put(1, "体育");
    classifierMap.put(2, "健康");
    classifierMap.put(3, "军事");
    classifierMap.put(4, "招聘");
    classifierMap.put(5, "教育");
    classifierMap.put(6, "文化");
    classifierMap.put(7, "旅游");
    classifierMap.put(8, "财经");

    分词器:中科院分词工具或者IK;本人采用IK分词器,通过测试发现其速度快、内存消耗低,训练数据时不会导致电脑死机;训练集是下载的搜狗新闻数据集,用于对新闻进行分类

    算法步骤:

    1. 首先下载IK分词器和搜狗新闻训练集和搜狗词典(对词进行了词性标注,个人只选择了名词,考虑到内存和速度,准确率的因素)

    2. 对训练集分词处理:将属于不同类别的新闻分别分词,并过滤掉词频低于10的词(出于节省内存和提高速度的考虑);结果以文本的形式保存,以类别名作为文件名

    3. 编写朴素贝叶斯分类函数,对输入文本进行分类处理,选择概率最大的作为分类类别

    4. web系统采用JSP+JavaBean+Servlet的架构,软件平台是新浪云;网址:http://naivebayes.sinaapp.com;如果无法访问,应该是服务器没有开

    使用方式:输入文本,并点击新闻分类;

    主程序代码:

    package com.sogou.servlet;
    
    import java.io.IOException;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import javax.servlet.RequestDispatcher;
    import javax.servlet.ServletContext;
    import javax.servlet.ServletException;
    import javax.servlet.annotation.WebServlet;
    import javax.servlet.http.HttpServlet;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;
    
    import com.sogou.util.BayesUtil;
    
    /**
     * Servlet implementation class BayesServlet
     */
    @WebServlet("/bayes.do")
    public class BayesServlet extends HttpServlet {
        private static final long serialVersionUID = 1L;
    
        /**
         * @see HttpServlet#HttpServlet()
         */
        public BayesServlet() {
            super();
            // TODO Auto-generated constructor stub
        }
    
        /**
         * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse
         *      response)
         */
        protected void doGet(HttpServletRequest request,
                HttpServletResponse response) throws ServletException, IOException {
            // TODO Auto-generated method stub
            this.doPost(request, response);
        }
    
        /**
         * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse
         *      response)
         */
        @SuppressWarnings("unchecked")
        protected void doPost(HttpServletRequest request,
                HttpServletResponse response) throws ServletException, IOException {
            // TODO Auto-generated method stub
            String newsText = request.getParameter("newsText");
            newsText = new String(newsText.getBytes("ISO8859-1"), "utf-8");
            ServletContext st = this.getServletContext();
            List<Map<String, Integer>> trainSets = (List<Map<String, Integer>>) st
                    .getAttribute("trainSets");
            Map<Integer, String> classifierMap = (Map<Integer, String>) st
                    .getAttribute("classifierMap");
            if (classifierMap == null) {
                classifierMap = new HashMap<Integer, String>();
                classifierMap.put(0, "IT");
                classifierMap.put(1, "体育");
                classifierMap.put(2, "健康");
                classifierMap.put(3, "军事");
                classifierMap.put(4, "招聘");
                classifierMap.put(5, "教育");
                classifierMap.put(6, "文化");
                classifierMap.put(7, "旅游");
                classifierMap.put(8, "财经");
                st.setAttribute("classifierMap", classifierMap);
            }
            BayesUtil bayes = new BayesUtil();
            if (trainSets == null) {
                String dirName = "D:/dataMing/bys";
                trainSets = bayes.loadTrainSet(dirName);
                st.setAttribute("trainSets", trainSets);
            }
            String classifier = bayes.bayesClassifierText(trainSets, newsText,
                    classifierMap);
            System.out.println(classifier);
            request.setAttribute("classifier", classifier);
            RequestDispatcher rd = request.getRequestDispatcher("./index.jsp");
            rd.forward(request, response);
        }
    
    }
    package com.sogou.util;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.LinkedList;
    import java.util.List;
    import java.util.Map;
    
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;
    
    /**
     * Naive Bayes helper: loads per-category word-frequency dictionaries and
     * classifies text against them.
     */
    public class BayesUtil {

        /** Reserved dictionary key holding the total word count of a category. */
        private static final String WORDS_COUNT_KEY = "wordsCount";

        /** Tokens shorter than this many characters are ignored when scoring. */
        private static final int MIN_TOKEN_LENGTH = 2;

        /**
         * Loads every dictionary file found directly inside {@code dirName}.
         *
         * <p>Each file is expected to contain one {@code word<TAB>count} pair per
         * line. The position of a dictionary in the returned list is the category
         * index, so files are processed in sorted name order;
         * {@link File#listFiles()} itself guarantees no order. Sub-directories are
         * skipped.
         *
         * @param dirName directory containing one dictionary file per category
         * @return one word-to-count map per dictionary file, in sorted file order
         * @throws IllegalArgumentException if the directory cannot be listed or a
         *         line is not a valid {@code word<TAB>count} pair
         * @throws IllegalStateException if a dictionary file cannot be read
         */
        public List<Map<String, Integer>> loadTrainSet(String dirName) {
            File[] files = new File(dirName).listFiles();
            if (files == null) {
                throw new IllegalArgumentException(
                        "Training set directory cannot be listed: " + dirName);
            }
            // Make the category index deterministic across platforms.
            Arrays.sort(files);

            List<Map<String, Integer>> list = new ArrayList<>(files.length);
            for (File file : files) {
                if (!file.isFile()) {
                    continue;
                }
                list.add(loadDictionary(file));
            }
            return list;
        }

        /** Reads a single {@code word<TAB>count} dictionary file into a map. */
        private static Map<String, Integer> loadDictionary(File file) {
            Map<String, Integer> dictionary = new HashMap<String, Integer>();
            // NOTE(review): FileReader uses the platform default charset, as the
            // original code did; the encoding of the training files is not
            // visible here - confirm before switching to an explicit charset.
            try (BufferedReader br = new BufferedReader(new FileReader(file))) {
                String line;
                int lineNo = 0;
                while ((line = br.readLine()) != null) {
                    lineNo++;
                    if (line.trim().isEmpty()) {
                        continue; // tolerate blank / trailing lines
                    }
                    String[] values = line.split("\t");
                    if (values.length < 2) {
                        throw new IllegalArgumentException("Malformed entry at "
                                + file + ":" + lineNo + " (expected word<TAB>count)");
                    }
                    try {
                        dictionary.put(values[0], Integer.parseInt(values[1]));
                    } catch (NumberFormatException e) {
                        throw new IllegalArgumentException("Invalid count at "
                                + file + ":" + lineNo + ": " + values[1], e);
                    }
                }
            } catch (IOException e) {
                // Silently dropping a dictionary would shift every later category
                // index, so fail loudly instead of returning a misaligned list.
                throw new IllegalStateException(
                        "Failed to read training dictionary: " + file, e);
            }
            return dictionary;
        }

        /**
         * Classifies {@code content} and returns the label of the most likely
         * category.
         *
         * <p>The text is segmented with IK; tokens shorter than two characters are
         * dropped. For each category the score is the sum over tokens of
         * {@code log(count / wordsCount)}, using a count of 1 for words missing
         * from the category dictionary. Ties keep the lowest category index; text
         * without usable tokens therefore yields the label of category 0.
         *
         * @param trainSets per-category word-to-count maps, each containing a
         *        {@code "wordsCount"} entry with the category's total word count
         * @param content text to classify; {@code null} is treated as empty
         * @param textClassifier category index to label
         * @return the label mapped to the best-scoring category index, or
         *         {@code null} if {@code textClassifier} has no entry for it
         * @throws IllegalArgumentException if {@code trainSets} is null or empty
         * @throws IllegalStateException if a dictionary lacks a positive
         *         {@code "wordsCount"} entry
         */
        public String bayesClassifierText(List<Map<String, Integer>> trainSets,
                String content, Map<Integer, String> textClassifier) {
            if (trainSets == null || trainSets.isEmpty()) {
                throw new IllegalArgumentException(
                        "trainSets must contain at least one category dictionary");
            }
            List<String> tokens = tokenize(content);

            int bestIndex = 0;
            double bestScore = 0.0;
            for (int i = 0; i < trainSets.size(); i++) {
                double score = logLikelihood(trainSets.get(i), tokens, i);
                if (i == 0 || score > bestScore) {
                    bestIndex = i;
                    bestScore = score;
                }
            }
            return textClassifier.get(bestIndex);
        }

        /** Segments {@code content} with IK and keeps tokens of at least two characters. */
        private static List<String> tokenize(String content) {
            List<String> tokens = new ArrayList<String>();
            if (content == null || content.isEmpty()) {
                return tokens;
            }
            IKSegmenter ik = new IKSegmenter(new StringReader(content), true);
            try {
                Lexeme lexeme;
                while ((lexeme = ik.next()) != null) {
                    String text = lexeme.getLexemeText();
                    if (text.length() >= MIN_TOKEN_LENGTH) {
                        tokens.add(text);
                    }
                }
            } catch (IOException e) {
                // Best effort, as before: classify with the tokens read so far.
                e.printStackTrace();
            }
            return tokens;
        }

        /**
         * Sums the log-probabilities of {@code tokens} under one category.
         * Scores are kept as {@code double}: accumulating them in a {@code long}
         * truncates every partial sum and distorts the ranking.
         */
        private static double logLikelihood(Map<String, Integer> dictionary,
                List<String> tokens, int categoryIndex) {
            if (tokens.isEmpty()) {
                return 0.0;
            }
            Integer total = dictionary.get(WORDS_COUNT_KEY);
            if (total == null || total <= 0) {
                throw new IllegalStateException("Category " + categoryIndex
                        + " has no positive \"" + WORDS_COUNT_KEY + "\" entry");
            }
            double logTotal = Math.log(total);
            double score = 0.0;
            for (String token : tokens) {
                Integer count = dictionary.get(token);
                // Unseen (or non-positive) counts fall back to a pseudo-count of 1,
                // i.e. log(1 / total) = -log(total).
                score += (count != null && count > 0)
                        ? Math.log(count) - logTotal
                        : -logTotal;
            }
            return score;
        }

        /**
         * Classifies the contents of a text file. Not implemented yet: this
         * method currently does nothing.
         *
         * @param fileName path of the file to classify
         */
        public void bayesClassifierFile(String fileName) {

        }
    }
  • 相关阅读:
    H5 20-属性选择器上
    H5 19-序选择器下
    H5 18-序选择器
    算法基础部分整理-《图解算法》
    iOS 内存管理之属性关键字
    多线程 ---基础定义部分
    Mongodb基本命令总结
    MySQL四种隔离级别和MVCC
    python同步原语--线程锁
    python进程间通信--信号Signal
  • 原文地址:https://www.cnblogs.com/csxf/p/3829698.html
Copyright © 2011-2022 走看看