zoukankan      html  css  js  c++  java
  • Naive Bayesian文本分类器

    贝叶斯学习方法中实用性非常高的一种为朴素贝叶斯学习器,常被称为朴素贝叶斯分类器。

    其性能在某些领域中与神经网络和决策树学习相当。尽管朴素贝叶斯分类器忽略了单词间的依赖关系,即假设所有单词是条件独立的,但朴素贝叶斯分类在实际应用中仍有非常出色的表现。

    朴素贝叶斯文本分类算法伪代码:


    朴素贝叶斯文本分类算法流程:


    通过计算训练集中每一个类别的概率与不同类别下每一个单词的概率,然后利用朴素贝叶斯公式计算新文档被分类为各个类别的概率,最终输出概率最大的类别。

    C++源代码:

    /*
    	Bayesian classifier for document classification
    	15S103182
    	Ethan
    	2015.12.27
    */
    #include <cmath>
    #include <cstddef>
    #include <fstream>
    #include <iomanip>
    #include <iostream>
    #include <iterator>
    #include <map>
    #include <sstream>
    #include <string>
    #include <vector>
    using namespace std;
    /*
        Parse the leading decimal integer of a string.

        a: text to parse; leading whitespace is skipped and parsing stops at
           the first character that cannot belong to an integer.
        Returns the parsed value, or 0 when no integer can be extracted.
    */
    int stringToInteger(const std::string& a){
        std::istringstream parser(a);
        // Start from 0 so a failed extraction never returns an indeterminate
        // value, whatever the library's behaviour on failure is.
        int value = 0;
        parser >> value;
        return value;
    }
    vector<int> openClassificationFile(const char* dataset){
    	fstream file;
    	file.open(dataset,ios::in);
    	if(!file) 
        {
            cout <<"Open File Failed!" <<endl;
            vector<int> a;
            return a;
        } 
    	vector<int> data;
    	int i=1;
    	while(!file.eof()){
    		string temp;
    		file>>temp;
    		data.push_back(stringToInteger(temp));
    	}
    	file.close();
    	return data;
    }
    /*
        Read a document as a list of whitespace-separated words.

        dataset: path of the text file to read.
        Returns the words in file order. The words are also echoed to stdout
        (tab-separated) followed by "Open file successfully!". When the file
        cannot be opened, "Open File Failed!" is printed and an empty vector
        is returned.
    */
    std::vector<std::string> openFile(const char* dataset){
        std::vector<std::string> data;
        std::ifstream file(dataset);
        if(!file){
            std::cout <<"Open File Failed!" <<std::endl;
            return data;
        }
        // Loop on the extraction itself: testing eof() before reading would
        // append an empty-string "word" whenever the file ends with
        // whitespace or a newline, and that word would then be counted.
        std::string word;
        while(file >> word){
            data.push_back(word);
        }
        for(std::vector<std::string>::const_iterator it = data.begin(); it != data.end(); ++it){
            std::cout<<*it<<"\t";
        }
        std::cout<<std::endl;
        std::cout<<"Open file successfully!"<<std::endl;
        return data;
    }
    vector<vector<string> > openFiles(const vector<char*> files){
    	vector<vector<string> > docs;
    	for(int i=0;i<files.size();i++){
    		vector<string> t = openFile(files[i]);
    		docs.push_back(t);
    	}
    	return docs;
    }
    /*
        Train a naive Bayes text classifier with add-one smoothing and
        classify one new document, printing every intermediate table and the
        final decision to stdout.

        docs: training documents, each a list of words.
        c:    class label of each training document (c[i] labels docs[i]).
              Labels may be arbitrary integers. Documents without a label
              are ignored and a warning is written to stderr.
        d:    words of the document to classify.

        Output sections, in order: "Word Frequency:", "Conditional
        Probability:", "Classification Probability:" and the final
        "The new document classification is:<label>" line. With an empty
        training set the reported label is 0.

        NOTE(review): the class prior is the share of training *words* that
        belong to the class, not the share of training *documents*. This is
        kept as in the original implementation; confirm it is intended.
    */
    void bayesian(const std::vector<std::vector<std::string> >& docs,
                  const std::vector<int>& c,
                  const std::vector<std::string>& d){
        typedef std::map<std::string,int> WordCounts;
        typedef std::map<int,WordCounts> ClassWordCounts;

        // Only labelled documents can be used; indexing c past its end
        // would be undefined behaviour.
        const std::size_t trainCount = docs.size() < c.size() ? docs.size() : c.size();
        if(trainCount < docs.size()){
            std::cerr<<"Warning: "<<(docs.size() - trainCount)
                     <<" training document(s) have no class label and are ignored"<<std::endl;
        }

        WordCounts wordFrequency;              // occurrences of each word over all training documents
        std::map<int,int> cTotalFrequency;     // number of words seen in each class
        ClassWordCounts cWordTotalFrequency;   // occurrences of each word inside each class
        int totalWords = 0;
        for(std::size_t i = 0; i < trainCount; ++i){
            const std::vector<std::string>& doc = docs[i];
            const int label = c[i];
            totalWords += static_cast<int>(doc.size());
            cTotalFrequency[label] += static_cast<int>(doc.size());
            WordCounts& classCounts = cWordTotalFrequency[label];
            for(std::size_t j = 0; j < doc.size(); ++j){
                ++wordFrequency[doc[j]];
                ++classCounts[doc[j]];
            }
        }

        // Prior of each class: share of all training words that belong to it.
        std::map<int,float> cWordProbability;
        for(std::map<int,int>::const_iterator cls = cTotalFrequency.begin(); cls != cTotalFrequency.end(); ++cls){
            cWordProbability[cls->first] = static_cast<float>(cls->second) / totalWords;
        }

        std::cout<<"Word Frequency:"<<std::endl;
        for(WordCounts::const_iterator word = wordFrequency.begin(); word != wordFrequency.end(); ++word){
            std::cout<<std::setw(8)<<word->first<<"\tFrequency:"<<word->second<<std::endl;
        }

        std::cout<<"Conditional Probability:"<<std::endl;
        WordCounts dtw;                        // word counts of the document to classify
        for(std::size_t j = 0; j < d.size(); ++j) ++dtw[d[j]];

        const std::size_t vocabularySize = wordFrequency.size();
        std::map<std::string,std::map<int,float> > cp;   // P(word | class)
        for(WordCounts::const_iterator word = dtw.begin(); word != dtw.end(); ++word){
            std::map<int,float>& perClass = cp[word->first];
            // Walk the classes that actually occur instead of assuming the
            // labels are exactly 0..k-1.
            for(std::map<int,int>::const_iterator cls = cTotalFrequency.begin(); cls != cTotalFrequency.end(); ++cls){
                int occurrences = 0;
                const ClassWordCounts::const_iterator classCounts = cWordTotalFrequency.find(cls->first);
                if(classCounts != cWordTotalFrequency.end()){
                    const WordCounts::const_iterator hit = classCounts->second.find(word->first);
                    if(hit != classCounts->second.end()) occurrences = hit->second;
                }
                // Add-one (Laplace) smoothing over the training vocabulary.
                const float p = static_cast<float>(occurrences + 1) / (cls->second + vocabularySize);
                perClass[cls->first] = p;
                std::cout<<"P("<<word->first<<"|"<<cls->first<<") \t= "<<p<<std::endl;
            }
        }

        std::cout<<"Classification Probability:"<<std::endl;
        int classification = 0;
        bool haveBest = false;
        double bestLogScore = 0.0;
        for(std::map<int,int>::const_iterator cls = cTotalFrequency.begin(); cls != cTotalFrequency.end(); ++cls){
            const int label = cls->first;
            const float prior = cWordProbability[label];
            float tcp = 1;   // plain product, kept only for display
            // The decision uses a sum of logarithms: the plain float product
            // underflows to 0 for long documents, which would make every
            // class look equally (im)probable.
            double logScore = std::log(static_cast<double>(prior));
            for(std::size_t j = 0; j < d.size(); ++j){
                const float p = cp[d[j]][label];
                tcp = tcp * p;
                logScore += std::log(static_cast<double>(p));
            }
            tcp = tcp * prior;
            std::cout<<"classification:"<<label<<"\t"<<"Probability:"<<tcp<<std::endl;
            // Strict comparison: on a tie the class with the smaller label wins.
            if(!haveBest || logScore > bestLogScore){
                haveBest = true;
                bestLogScore = logScore;
                classification = label;
            }
        }
        std::cout<<"The new document classification is:"<<classification<<std::endl;
    }
    
    int main(int argc, char** argv) {
    	vector<vector<string> > docs;
    	vector<int> c = openClassificationFile("classification.txt");
    	vector<char *> files;
    	files.push_back("1.txt");files.push_back("2.txt");files.push_back("3.txt");files.push_back("4.txt");files.push_back("5.txt");
    	cout<<"训练文档集:"<<endl;
    	docs = openFiles(files);
    	vector<string> d;
    	cout<<"待分类文档:"<<endl; 
    	d = openFile("new.txt");
    	bayesian(docs,c,d);
    	return 0;
    }

    效果展示:


    结论:

    朴素贝叶斯分类器用于处理离散型的文本数据,可以有效对文本文档进行分类。在实验过程中,最困难的地方在于数据结构的设计。因为要统计每一个文档类别的频数和每一个文档类别下单词的概率,这个地方需要用到复杂映射与统计。在编码过程中经过不断的思考,最终通过多级映射的形式储存所需的数据,最终计算出新文档的类别。通过实验,成功将新的未分类文档输入样例分类为期待的文档类型。实验结果较为令人满意。



  • 相关阅读:
    C语言利用fgetc复制拷贝文件内容
    linux 安装gcc 和 g++
    C宏定义和使用
    C的realloc的动态分配扩展和缩小内存
    C用malloc 向系统申请一个大小为n*4个字节的内存块
    GDB core命令的使用调试段错误
    GDB的安装
    C字符指针数组的使用
    C二维字符数组的使用及如何获取二维数组的总行数和总列数!
    C二维数组用指针地址遍历
  • 原文地址:https://www.cnblogs.com/gccbuaa/p/7224301.html
Copyright © 2011-2022 走看看