zoukankan      html  css  js  c++  java
  • 朴素贝叶斯新闻分类系统

    基于搜狗语料库,建立的一个新闻分类系统;类别包括:

    classifierMap.put(0, "IT");
    classifierMap.put(1, "体育");
    classifierMap.put(2, "健康");
    classifierMap.put(3, "军事");
    classifierMap.put(4, "招聘");
    classifierMap.put(5, "教育");
    classifierMap.put(6, "文化");
    classifierMap.put(7, "旅游");
    classifierMap.put(8, "财经");

    分词器:中科院分词工具或者IK;本人采用IK分词器,测试发现它在训练数据时速度快、内存消耗低,不会导致电脑死机;训练集是下载的搜狗新闻数据集,用于对新闻分类

    算法步骤:

    1. 首先下载IK分词器和搜狗新闻训练集和搜狗词典(对词进行了词性标注,个人只选择了名词,考虑到内存和速度,准确率的因素)

    2. 对训练集分词处理:将属于不同类别的新闻分别分词,并过滤掉词频低于10的词(出于节省内存和提高速度的考虑);结果以文本的形式保存,以类别名作为文件名

    3. 编写朴素贝叶斯分类函数,对输入文本进行分类处理,选择概率最大的作为分类类别

    4. web系统采用JSP+JavaBean+Servlet的架构,软件平台是新浪云;网址:http://naivebayes.sinaapp.com;如果无法访问,应该是服务器没有开

    使用方式:输入文本,并点击新闻分类;

    主程序代码:

    package com.sogou.servlet;
    
    import java.io.IOException;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import javax.servlet.RequestDispatcher;
    import javax.servlet.ServletContext;
    import javax.servlet.ServletException;
    import javax.servlet.annotation.WebServlet;
    import javax.servlet.http.HttpServlet;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;
    
    import com.sogou.util.BayesUtil;
    
    /**
     * Servlet implementation class BayesServlet
     */
    @WebServlet("/bayes.do")
    public class BayesServlet extends HttpServlet {
        private static final long serialVersionUID = 1L;
    
        /**
         * @see HttpServlet#HttpServlet()
         */
        public BayesServlet() {
            super();
            // TODO Auto-generated constructor stub
        }
    
        /**
         * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse
         *      response)
         */
        protected void doGet(HttpServletRequest request,
                HttpServletResponse response) throws ServletException, IOException {
            // TODO Auto-generated method stub
            this.doPost(request, response);
        }
    
        /**
         * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse
         *      response)
         */
        @SuppressWarnings("unchecked")
        protected void doPost(HttpServletRequest request,
                HttpServletResponse response) throws ServletException, IOException {
            // TODO Auto-generated method stub
            String newsText = request.getParameter("newsText");
            newsText = new String(newsText.getBytes("ISO8859-1"), "utf-8");
            ServletContext st = this.getServletContext();
            List<Map<String, Integer>> trainSets = (List<Map<String, Integer>>) st
                    .getAttribute("trainSets");
            Map<Integer, String> classifierMap = (Map<Integer, String>) st
                    .getAttribute("classifierMap");
            if (classifierMap == null) {
                classifierMap = new HashMap<Integer, String>();
                classifierMap.put(0, "IT");
                classifierMap.put(1, "体育");
                classifierMap.put(2, "健康");
                classifierMap.put(3, "军事");
                classifierMap.put(4, "招聘");
                classifierMap.put(5, "教育");
                classifierMap.put(6, "文化");
                classifierMap.put(7, "旅游");
                classifierMap.put(8, "财经");
                st.setAttribute("classifierMap", classifierMap);
            }
            BayesUtil bayes = new BayesUtil();
            if (trainSets == null) {
                String dirName = "D:/dataMing/bys";
                trainSets = bayes.loadTrainSet(dirName);
                st.setAttribute("trainSets", trainSets);
            }
            String classifier = bayes.bayesClassifierText(trainSets, newsText,
                    classifierMap);
            System.out.println(classifier);
            request.setAttribute("classifier", classifier);
            RequestDispatcher rd = request.getRequestDispatcher("./index.jsp");
            rd.forward(request, response);
        }
    
    }
    package com.sogou.util;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.LinkedList;
    import java.util.List;
    import java.util.Map;
    
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;
    
    /**
     * Naive Bayes text classifier backed by per-category word-frequency
     * dictionaries and the IK segmenter.
     */
    public class BayesUtil {

        /**
         * Reserved dictionary key whose value is used as the denominator of every
         * word probability of a category (presumably the category's total word
         * count - confirm against the tool that writes the training files).
         */
        private static final String WORDS_COUNT_KEY = "wordsCount";

        /** Segmented tokens shorter than this many characters are ignored. */
        private static final int MIN_TOKEN_LENGTH = 2;

        /**
         * Loads every regular file in the given directory as one category
         * dictionary. Each non-blank line must have the form
         * {@code word<TAB>count}.
         *
         * <p>
         * The position of a dictionary in the returned list is its category index,
         * so files are processed in sorted path order and a file that cannot be
         * read aborts loading instead of being skipped (skipping would silently
         * shift every later category index).
         *
         * @param dirName path of the directory holding the training files
         * @return one word-to-count map per file, in sorted file order
         * @throws IllegalArgumentException if the directory cannot be listed or a
         *                                  line is malformed
         * @throws IllegalStateException    if a file cannot be read
         */
        public List<Map<String, Integer>> loadTrainSet(String dirName) {
            File[] files = new File(dirName).listFiles();
            if (files == null) {
                // listFiles() returns null when the path is not a directory or an
                // I/O error occurs.
                throw new IllegalArgumentException(
                        "Training set directory cannot be listed: " + dirName);
            }
            // listFiles() gives no ordering guarantee; sort so that category
            // indices are stable from run to run.
            Arrays.sort(files);

            List<Map<String, Integer>> list = new ArrayList<>(files.length);
            for (File file : files) {
                if (file.isFile()) {
                    list.add(loadDictionary(file));
                }
            }
            return list;
        }

        /**
         * Reads one {@code word<TAB>count} file into a map. Blank lines are
         * skipped.
         */
        private static Map<String, Integer> loadDictionary(File file) {
            Map<String, Integer> frequencies = new HashMap<String, Integer>();
            // NOTE(review): FileReader decodes with the platform default charset;
            // confirm that this matches the encoding the training files were
            // written in.
            try (BufferedReader br = new BufferedReader(new FileReader(file))) {
                String line;
                int lineNumber = 0;
                while ((line = br.readLine()) != null) {
                    lineNumber++;
                    if (line.trim().isEmpty()) {
                        continue;
                    }
                    String[] values = line.split("\t");
                    if (values.length < 2) {
                        throw new IllegalArgumentException("Expected word<TAB>count at "
                                + file + ":" + lineNumber + " but found: " + line);
                    }
                    try {
                        frequencies.put(values[0], Integer.parseInt(values[1].trim()));
                    } catch (NumberFormatException e) {
                        throw new IllegalArgumentException("Invalid count at " + file
                                + ":" + lineNumber + ": " + values[1], e);
                    }
                }
            } catch (IOException e) {
                throw new IllegalStateException(
                        "Failed to read training file: " + file, e);
            }
            return frequencies;
        }

        /**
         * Classifies a text: segments it into words, sums the log-probability of
         * every word for each category and returns the name of the category with
         * the highest score (the lowest index wins ties). A word absent from a
         * category's dictionary is scored as if it had a count of 1.
         *
         * @param trainSets      one word-to-count map per category, each containing
         *                       a positive {@code "wordsCount"} entry
         * @param content        the text to classify; {@code null} is treated as
         *                       empty
         * @param textClassifier maps a category index (position in
         *                       {@code trainSets}) to its display name
         * @return the name mapped to the best-scoring category index, or
         *         {@code null} if {@code textClassifier} has no entry for it
         * @throws IllegalArgumentException if {@code trainSets} is null or empty,
         *                                  or a needed {@code "wordsCount"} entry
         *                                  is missing or not positive
         * @throws IllegalStateException    if the segmenter fails to read the text
         */
        public String bayesClassifierText(List<Map<String, Integer>> trainSets,
                String content, Map<Integer, String> textClassifier) {
            if (trainSets == null || trainSets.isEmpty()) {
                throw new IllegalArgumentException(
                        "trainSets must contain at least one category");
            }
            double[] scores = scoreCategories(trainSets, segment(content));
            int best = 0;
            for (int i = 1; i < scores.length; i++) {
                if (scores[i] > scores[best]) {
                    best = i;
                }
            }
            return textClassifier.get(best);
        }

        /**
         * Splits the text with the IK segmenter and keeps the tokens that are at
         * least {@link #MIN_TOKEN_LENGTH} characters long.
         */
        private static List<String> segment(String content) {
            List<String> tokens = new ArrayList<String>();
            if (content == null || content.isEmpty()) {
                return tokens;
            }
            // NOTE(review): the second argument presumably selects IK's "smart"
            // segmentation mode - confirm against the IK Analyzer documentation.
            IKSegmenter ik = new IKSegmenter(new StringReader(content), true);
            try {
                Lexeme lexeme;
                while ((lexeme = ik.next()) != null) {
                    String text = lexeme.getLexemeText();
                    if (text.length() >= MIN_TOKEN_LENGTH) {
                        tokens.add(text);
                    }
                }
            } catch (IOException e) {
                throw new IllegalStateException("Failed to segment text", e);
            }
            return tokens;
        }

        /**
         * Computes, for each category, the sum over all tokens of
         * {@code log(count / wordsCount)}.
         *
         * <p>
         * The scores are accumulated as {@code double}: with a {@code long}
         * accumulator, {@code +=} narrows the result on every addition and drops
         * the fractional part of each log term.
         */
        private static double[] scoreCategories(
                List<Map<String, Integer>> trainSets, List<String> tokens) {
            double[] scores = new double[trainSets.size()];
            if (tokens.isEmpty()) {
                // Nothing to score; every category stays at 0.
                return scores;
            }
            for (int i = 0; i < scores.length; i++) {
                Map<String, Integer> frequencies = trainSets.get(i);
                Integer total = frequencies.get(WORDS_COUNT_KEY);
                if (total == null || total <= 0) {
                    throw new IllegalArgumentException("Training set " + i
                            + " has no positive \"" + WORDS_COUNT_KEY + "\" entry");
                }
                for (String token : tokens) {
                    Integer frequency = frequencies.get(token);
                    // Unseen words count as a single occurrence so the log stays
                    // finite.
                    double count = (frequency != null) ? frequency : 1.0;
                    scores[i] += Math.log(count / total);
                }
            }
            return scores;
        }

        /**
         * Classifies the contents of a text file. Not implemented yet; calling it
         * currently does nothing.
         *
         * @param fileName path of the file to classify
         */
        public void bayesClassifierFile(String fileName) {

        }
    }
  • 相关阅读:
    《DSP using MATLAB》Problem 6.17
    一些老物件
    《DSP using MATLAB》Problem 6.16
    《DSP using MATLAB》Problem 6.15
    《DSP using MATLAB》Problem 6.14
    《DSP using MATLAB》Problem 6.13
    《DSP using MATLAB》Problem 6.12
    《DSP using MATLAB》Problem 6.11
    P1414 又是毕业季II
    Trie树
  • 原文地址:https://www.cnblogs.com/csxf/p/3829698.html
Copyright © 2011-2022 走看看