zoukankan      html  css  js  c++  java
  • logistic回归 c++ 实现

      logistic回归是统计学习中经典的分类方法,他属于对数线性模型。本博文主要给出logistic的c++实现,具体理论请读者自行google。

      本文用到的数据集来自于一个医学网站,具体出处不记得了(非常歉意)。数据的格式如下:

      10009 1 0 0 1 0 1
      10025 0 0 1 2 0 0
      20035 0 0 1 0 0 1
      20053 1 0 0 0 0 0
      30627 1 0 1 2 0 0
      30648 2 0 0 0 1 0

    每行有7个列值,第一列是一个ID号,在具体操作中,忽略该列。之后的5列,每一个都表示一个特征的取值;最后一列是分类标记(0或1)。

      在具体实现时,将分隔数据为训练数据和测试数据,并保存到文件中,文件组织形式如下:

      

    其中testdata.txt,保存测试数据;traindata.txt保存训练数据;logistic.cpp是代码源文件。三个文件保存在同一目录下。

    实现代码如下:

    /*********
    	logistic回归(c++) by 姜富春
    **********/
    #include<iostream>
    #include<fstream>
    #include<vector>
    #include<sstream>
    #include<cmath>
    using namespace std;
    struct Data{
    	vector<int> features;
    	int cls;
    	Data(vector<int> f,int c):features(f),cls(c){
    		
    	}
    };
    struct Param{
    	vector<double> w;
    	double d;
    	Param(vector<double> w1,double d1):w(w1),d(d1){};
    	Param():w(vector<double>()),d(0.0){}
    };
    class Logistic{
    public:
    	Logistic(){
    		//载入traindata文件构造dataSet;
    		loadDataSet(dataSet);
    		//初始化Param,w的长度与数据特征的长度相同,初值为0.0。d的初值也为0.0
    		vector<double> pw(dataSet[0].features.size(),0.0);
    		Param pt(pw,0.0);
    		param=pt;
    
    	};
    	void loadDataSet(vector<Data>& ds,string dataFile="./traindata.txt"){
    		ifstream fin(dataFile.c_str());
    		if(!fin){
    			cout<<"文件打开失败"<<endl;
    			exit(0);
    		}
    		while(fin){
    			string line;
    			getline(fin,line);
    			if(line.size()>3){
    				stringstream sin(line);
    				int t;
    				sin>>t;
    				vector<int> fea;
    				while(sin){
    					char c=sin.peek();
    					if(int(c)!=-1){
    						sin>>t;
    						fea.push_back(t);
    					}
    				
    				}
    				int cl=fea.back();
    				fea.pop_back();
    				ds.push_back(Data(fea,cl));
    			}
    		}
    	}
    	
    	void displayDataSet(){
    		for(int i=0;i<dataSet.size();i++){
    			for(int j=0;j<dataSet[i].features.size();j++){
    				cout<<dataSet[i].features[j]<<" ";
    			}
    			cout<<" 分类:"<<dataSet[i].cls;
    			cout<<endl;
    		}
    	}
    	void logisticRegression(){
    		//由目标函数为最大似然,因此最终求得的是目标函数的最大值,
    		//因此迭代过程是梯度上升,而非梯度下降
    		double lamda=0.1;//梯度下降的步长
    		double delta=0.0001;//结束迭代的阈值
    		//目标函数的值
    		double objLw=Lw(param);
    		//cout<<objLw<<endl;
    		Param tpa(param.w,param.d);
    		gradient(lamda);
    		double newObjLw=Lw(param);
    		int iter=0;
    		cout<<"初始:"<<endl;
    		displayIterProcess(iter,objLw,newObjLw,1);
    		while(fabs(newObjLw-objLw)>delta||!samewb(tpa,param,delta)){
    			objLw=newObjLw;
    			tpa=Param(param.w,param.d);
    			gradient(lamda);
    			newObjLw=Lw(param);
    			++iter;
    			displayIterProcess(iter,objLw,newObjLw,5);
    		}
    		cout<<"迭代结束共迭代"<<iter<<"步"<<endl;
    		displayIterProcess(iter,objLw,newObjLw,1);
    
    	}
    	bool samewb(const Param &tparam,const Param& param,double delta){
    		for(int i=0;i<tparam.w.size();i++){
    			if(fabs(tparam.w[i]-param.w[i])>delta){
    				return false;
    			}
    		}
    		if(fabs(tparam.d-param.d)>delta){
    			return false;
    		}
    		return true;
    	}
    	void displayIterProcess(int iter,double objLw,double newObjLw,int mod){
    		//每mod步打印一次迭代过程
    		if(iter%mod==0){
    			cout<<"迭代"<<iter<<":目标函数值【"<<newObjLw<<"】,两次迭代目标函数差值【 "<<(newObjLw-objLw)<<"】"<<endl;
    			cout<<"模型参数:";
    			for(int i=0;i<param.w.size();i++){
    				cout<<param.w[i]<<" ";
    			}
    			cout<<param.d<<endl<<endl;
    		}
    		
    	}
    	//梯度上升更新w和b
    	void gradient(double lam){
    		for(int i=0;i<param.w.size();i++){
    			double tmp=0.0L;//保存梯度上升过程的中间值
    			for(int j=0;j<dataSet.size();j++){
    				tmp+=(dataSet[j].cls-logiFun(param,dataSet[j]))*dataSet[j].features[i]*lam;
    			}
    			param.w[i]+=(tmp);
    		}
    		double tmp=0.0L;
    		for(int j=0;j<dataSet.size();j++){
    			tmp+=(dataSet[j].cls-logiFun(param,dataSet[j]))*lam;
    		}
    		param.d+=tmp;
    	
    	}
    	//计算logistic函数的值,即f(x)=exp(wx)/(1+exp(wx)),该表达式在求解梯度过程中出现,
    	//因此计算这个值是为了辅助梯度上升计算过程
    	inline double logiFun(const Param &p,const Data &d){
    		double inner=innerWX(p,d);
    		double le=exp(inner)/(1+exp(inner));
    		return le;
    	}
    	//计算对数似然函数的值
    	double Lw(Param p){
    		double l=0.0L;
    		for(int i=0;i<dataSet.size();i++){
    		    double inner=innerWX(p,dataSet[i]);
    			l+=(dataSet[i].cls*inner-(log10(1+exp(inner))));
    			//cout<<"l="<<l<<endl;
    		}
    		
    		return l;
    	}
    	//计算wx+b的值
    	inline double innerWX(const Param &p,const Data &data){
    		if(p.w.size()!=data.features.size()){
    			cout<<"参数与实例的维度不匹配,不能进行内积计算"<<endl;
    			exit(0);
    		}
    		double innerP=0.0L;
    		for(int i=0;i<p.w.size();i++){
    			innerP+=(p.w[i]*data.features[i]);
    		}
    		innerP+=p.d;
    		return innerP;
    	}
    	//给定测试集,预测分类
    	void predictClass(){
    		vector<Data> testDataSet;
    		loadDataSet(testDataSet,"./testdata.txt");
    		/*******************
    		分别计算
    			P(Y=1|x)=exp(w.x)/(1+exp(w.x))
    		和
    			P(Y=0|x)=1/(1+exp(w.x))
    		然后取值大的作为x的分类
    		*******************/
    		cout<<endl<<"预测分类:"<<endl;
    		for(int i=0;i<testDataSet.size();i++){
    			double py1=0.0L;
    			double py0=0.0L;
    			double inner=innerWX(param,testDataSet[i]);
    			py1=exp(inner)/(1+exp(inner));
    			py0=1-py1;
    			cout<<"实例: ";
    			for(int j=0;j<testDataSet[i].features.size();j++){
    				cout<<testDataSet[i].features[j]<<" ";
    			}
    			cout<<"标记分类【"<<testDataSet[i].cls<<"】,";
    			if(py1>=py0){
    				
    				cout<<"预测分类【"<<1<<"】"<<endl;
    			}else{
    				cout<<"预测分类【"<<0<<"】"<<endl;
    			}
    		}
    	}
    private:
    	vector<Data> dataSet;
    	Param param;
    };
    int main(){
    	Logistic logist;
    	//logist.displayDataSet();
    	logist.logisticRegression();
    	logist.predictClass();
    	system("pause");
    	return 0;
    }
    

      程序运行结果如下:

    1. 迭代训练模型过程如下

      2. 测试分类过程如下:

    本例中分类测试的效果并不是太好,在调试的时候,看到这样的结果我也仔细地审查代码,并未发现程序的错误(也希望读者帮忙审查)。考虑可能的原因:

      (1)数据太少

      (2)数据特征不能提供足够信息

      (3)线性模型,无法很好地划分数据

    附件:

    训练数据:traindata.txt

    10009 1 0 0 1 0 1
    10025 0 0 1 2 0 0
    10038 1 0 0 1 1 0
    10042 0 0 0 0 1 0
    10049 0 0 1 0 0 0
    10113 0 0 1 0 1 0
    10131 0 0 1 2 1 0
    10160 1 0 0 0 0 0
    10164 0 0 1 0 1 0
    10189 1 0 1 0 0 0
    10215 0 0 1 0 1 0
    10216 0 0 1 0 0 0
    10235 0 0 1 0 1 0
    10270 1 0 0 1 0 0
    10282 1 0 0 0 1 0
    10303 2 0 0 0 1 0
    10346 1 0 0 2 1 0
    10380 2 0 0 0 1 0
    10429 2 0 1 0 0 0
    10441 0 0 1 0 1 0
    10443 0 0 1 2 0 0
    10463 0 0 0 0 0 0
    10475 0 0 1 0 1 0
    10489 1 0 1 0 1 1
    10518 0 0 1 2 1 0
    10529 1 0 1 0 0 0
    10545 0 0 1 0 0 0
    10546 0 0 0 2 0 0
    10575 1 0 0 0 1 0
    10579 2 0 1 0 0 0
    10581 2 0 1 1 1 0
    10600 1 0 1 1 0 0
    10627 1 0 1 2 0 0
    10653 1 0 0 1 1 0
    10664 0 0 0 0 1 0
    10691 1 1 0 0 1 0
    10692 1 0 1 2 1 0
    10711 0 0 0 0 1 0
    10714 0 0 1 0 0 0
    10739 1 0 1 1 1 0
    10750 1 0 1 0 1 0
    10764 2 0 1 2 0 0
    10770 0 0 1 2 1 0
    10780 0 0 1 0 1 0
    10784 2 0 1 0 1 0
    10785 0 0 1 0 1 0
    10788 1 0 0 0 0 0
    10815 1 0 0 0 1 0
    10816 0 0 0 0 1 0
    10818 0 0 1 2 1 0
    11095 0 1 1 0 0 0
    11146 0 1 0 0 1 0
    11206 2 1 0 0 0 0
    11223 2 1 0 0 0 0
    11236 1 1 0 2 0 0
    11244 1 1 0 0 0 1
    11245 0 1 0 0 0 0
    11278 2 1 0 0 1 0
    11322 0 1 0 0 1 0
    11326 2 1 0 2 1 0
    11329 2 1 0 2 1 0
    11344 1 1 0 2 1 0
    11358 0 1 0 0 0 1
    11417 2 1 1 0 1 0
    11421 2 1 0 1 1 0
    11484 1 1 0 0 0 1
    11499 2 1 0 0 0 0
    11503 1 1 0 0 1 0
    11527 1 1 0 0 0 0
    11540 2 1 0 1 1 0
    11580 1 1 0 0 1 0
    11583 1 0 1 1 0 1
    11592 2 1 0 1 1 0
    11604 0 1 0 0 1 0
    11625 1 0 1 0 0 0
    20035 0 0 1 0 0 1
    20053 1 0 0 0 0 0
    20070 0 0 0 2 1 0
    20074 1 0 1 2 0 1
    20146 1 0 0 1 1 0
    20149 2 0 1 2 1 0
    20158 2 0 0 0 1 0
    20185 1 0 0 1 1 0
    20193 1 0 1 0 1 0
    20194 0 0 1 0 0 0
    20205 1 0 0 2 1 0
    20206 2 0 1 1 1 0
    20265 0 0 1 0 1 0
    20311 0 0 0 0 1 0
    20328 2 0 0 1 0 1
    20353 0 0 1 0 0 0
    20372 0 0 0 0 0 0
    20405 1 0 1 1 1 1
    20413 2 0 1 0 1 0
    20427 0 0 0 0 0 0
    20455 1 0 1 0 1 0
    20462 0 0 0 0 1 0
    20472 0 0 0 2 0 0
    20485 0 0 0 0 0 0
    20523 0 0 1 2 0 0
    20539 0 0 1 0 1 0
    20554 0 0 1 0 0 1
    20565 0 0 0 2 1 0
    20566 1 0 1 1 1 0
    20567 1 0 0 1 1 0
    20568 0 0 1 0 1 0
    20569 1 0 0 0 0 0
    20571 1 0 1 0 1 0
    20581 2 0 0 0 1 0
    20583 1 0 0 0 1 0
    20585 2 0 0 1 1 0
    20586 0 0 1 2 1 0
    20591 1 0 1 2 0 0
    20595 0 0 1 2 1 0
    20597 1 0 0 0 0 0
    20599 0 0 1 0 1 0
    20607 0 0 0 1 1 0
    20611 1 0 0 0 1 0
    20612 2 0 0 1 1 0
    20614 1 0 0 1 1 0
    20615 1 0 1 0 0 0
    21017 1 1 0 1 1 0
    21058 2 1 0 0 1 0
    21063 0 1 0 0 0 0
    21084 1 1 0 1 0 1
    21087 1 1 0 2 1 0
    21098 0 1 0 0 0 0
    21099 1 1 0 2 0 0
    21113 0 1 0 0 1 0
    21114 1 1 0 0 1 1
    21116 1 1 0 2 1 0
    21117 1 0 0 2 1 0
    21138 2 1 1 1 1 0
    21154 0 1 0 0 1 0
    21165 0 1 0 0 1 0
    21181 2 1 0 0 0 1
    21183 1 1 0 2 1 0
    21231 1 1 0 0 1 0
    21234 1 1 1 0 0 0
    21286 2 1 0 2 1 0
    21352 2 1 1 1 0 0
    21395 0 1 0 0 1 0
    21417 1 1 0 2 1 0
    21423 0 1 0 0 1 0
    21426 1 1 0 1 1 0
    21433 0 1 0 0 1 0
    21435 0 1 0 0 0 0
    21436 1 1 0 0 0 0
    21439 1 1 0 2 1 0
    21446 1 1 0 0 0 0
    21448 0 1 1 2 0 0
    21453 2 1 0 0 1 0
    30042 2 0 1 0 0 1
    30080 0 0 1 0 1 0
    301003 1 0 1 0 0 0
    301009 0 0 1 2 1 0
    301017 0 0 1 0 0 0
    30154 1 0 1 0 1 0
    30176 0 0 1 0 1 0
    30210 0 0 1 0 1 0
    30239 1 0 1 0 1 0
    30311 0 0 0 0 0 1
    30382 0 0 1 2 1 0
    30387 0 0 1 0 1 0
    30415 0 0 1 0 1 0
    30428 0 0 1 0 0 0
    30479 0 0 1 0 0 1
    30485 0 0 1 2 1 0
    30493 2 0 1 2 1 0
    30519 0 0 1 0 1 0
    30532 0 0 1 0 1 0
    30541 0 0 1 0 1 0
    30567 1 0 0 0 0 0
    30569 2 0 1 1 1 0
    30578 0 0 1 0 0 1
    30579 1 0 1 0 0 0
    30596 1 0 1 1 1 0
    30597 1 0 1 1 0 0
    30618 0 0 1 0 0 0
    30622 1 0 1 1 1 0
    30627 1 0 1 2 0 0
    30648 2 0 0 0 1 0
    30655 0 0 1 0 0 1
    30658 0 0 1 0 1 0
    30667 0 0 1 0 1 0
    30678 1 0 1 0 0 0
    30701 0 0 1 0 0 0
    30703 2 0 1 1 0 0
    30710 0 0 1 2 0 0
    30713 1 0 0 1 1 1
    30716 0 0 0 0 1 0
    30721 0 0 0 0 0 1
    30723 0 0 1 0 1 0
    30724 2 0 1 2 1 0
    30733 1 0 0 1 0 0
    30734 0 0 1 0 0 0
    30736 2 0 0 1 1 1
    30737 0 0 1 0 0 0
    30740 0 0 1 0 1 0
    30742 2 0 1 0 1 0
    30743 0 0 1 0 1 0
    30745 2 0 0 0 1 0
    30754 1 0 1 0 1 0
    30758 1 0 0 0 1 0
    30764 0 0 1 0 0 1
    30765 2 0 0 0 0 0
    30769 2 0 0 1 1 0
    30772 0 0 1 0 1 0
    30774 0 0 0 0 1 0
    30784 2 0 1 0 0 0
    30786 1 0 1 0 1 0
    30787 0 0 0 0 1 0
    30789 1 0 1 0 1 0
    30800 0 0 1 0 0 0
    30801 1 0 1 0 1 0
    30803 1 0 1 0 1 0
    30806 1 0 1 0 1 0
    30817 0 0 1 2 0 0
    30819 2 0 1 0 1 1
    30822 0 0 1 0 1 0
    30823 0 0 1 2 1 0
    30834 0 0 0 0 0 0
    30836 0 0 1 0 1 0
    30837 1 0 1 0 1 0
    30840 0 0 1 0 1 0
    30841 1 0 1 0 0 0
    30844 0 0 1 0 1 0
    30845 0 0 1 0 0 0
    30847 1 0 1 0 0 0
    30848 0 0 1 0 1 0
    30850 0 0 1 0 1 0
    30856 1 0 0 0 1 0
    30858 0 0 1 0 0 0
    30860 0 0 0 0 1 0
    30862 1 0 1 1 1 0
    30864 0 0 0 2 0 0
    30867 0 0 1 0 1 0
    30869 0 0 1 0 1 0
    30887 0 0 1 0 1 0
    30900 1 0 0 1 1 0
    30913 2 0 0 0 1 0
    30914 1 0 0 0 0 0
    30922 2 0 0 2 1 0
    30923 0 0 1 2 1 0
    30927 1 0 1 0 0 1
    30929 0 0 1 2 1 0
    30933 0 0 1 2 1 0
    30940 0 0 1 0 1 0
    30943 1 0 1 2 1 0
    30945 0 0 0 2 0 0
    30951 1 0 0 0 0 0
    30964 0 0 0 2 1 0
    30969 0 0 1 0 1 0
    30979 2 0 0 0 1 0
    30980 1 0 0 0 0 0
    30982 1 0 0 1 1 0
    30990 1 0 1 1 1 0
    30991 1 0 1 0 1 1
    30999 0 0 1 0 1 0
    31056 1 1 0 2 1 0
    31068 1 1 0 1 0 0
    31108 2 1 0 2 1 0
    31168 1 1 1 0 0 0
    31191 0 1 1 0 0 0
    31229 0 1 1 0 0 1
    31263 0 1 0 0 1 0
    31281 1 1 1 0 0 0
    31340 1 1 1 0 1 0
    31375 0 1 0 0 1 0
    31401 0 1 1 0 0 1
    31480 1 1 1 1 1 0
    31501 1 1 0 2 1 0
    31514 0 1 0 2 0 0
    31518 1 1 0 2 1 0
    31532 0 0 1 2 1 0
    31543 2 1 1 1 1 0
    31588 0 1 0 0 1 0
    31590 0 0 1 0 1 0
    31591 2 1 0 1 1 0
    31595 0 1 0 0 1 0
    31596 1 1 0 0 0 0
    31598 1 1 0 0 1 0
    31599 0 1 0 0 0 0
    31605 0 1 1 0 0 0
    31612 2 1 0 0 1 0
    31615 2 1 0 0 0 0
    31628 1 1 0 0 1 0
    31640 2 1 0 1 1 0

    测试数据:testdata.txt

    10009 1 0 0 1 0 1
    10025 0 0 1 2 0 0
    20035 0 0 1 0 0 1
    20053 1 0 0 0 0 0
    30627 1 0 1 2 0 0
    30648 2 0 0 0 1 0

  • 相关阅读:
    软工实践个人总结
    Beta版本演示
    Beta 5
    Beta 4
    Beta 3
    Beta 2
    2020系统综合实践 期末大作业 02组
    2020系统综合实践 第7次实践作业 2组
    2020系统综合实践 第6次实践作业 2组
    2020系统综合实践 第5次实践作业
  • 原文地址:https://www.cnblogs.com/jfcspring/p/3512356.html
Copyright © 2011-2022 走看看