线性回归,逻辑回归
纪要:
逻辑回归实现代码(C++):
/********* Logistic regression (C++) by 姜富春 **********/
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

/// One labelled sample: the integer feature values of a row and its class
/// label (the last column of the row).
struct Data {
    std::vector<int> features;
    int cls;

    Data(const std::vector<int>& f, int c) : features(f), cls(c) {}
};

/// Model parameters: one weight per feature (`w`) plus the intercept (`d`).
struct Param {
    std::vector<double> w;
    double d;

    Param(const std::vector<double>& w1, double d1) : w(w1), d(d1) {}
    Param() : w(), d(0.0) {}
};

/// Binary logistic-regression model trained by maximising the log-likelihood.
///
/// The constructor loads the training set from "./traindata.txt";
/// `logisticRegression()` fits the parameters and `predictClass()` classifies
/// the rows of "./testdata.txt".
class Logistic {
public:
    /// Loads the training data and zero-initialises the parameters so that
    /// `w` has one entry per feature of the first training row.
    Logistic() {
        loadDataSet(dataSet);
        // An empty training set must not index dataSet[0]; fall back to a
        // zero-length weight vector instead.
        const std::size_t dim = dataSet.empty() ? 0 : dataSet[0].features.size();
        param = Param(std::vector<double>(dim, 0.0), 0.0);
    }

    /// Appends every parsable row of `dataFile` to `ds`.
    ///
    /// Each row is whitespace-separated integers: an ID (ignored), the
    /// feature values, and the class label as the last value. Rows without a
    /// numeric ID or without at least a label are skipped. Terminates the
    /// program with a failure status when the file cannot be opened.
    void loadDataSet(std::vector<Data>& ds, const std::string& dataFile = "./traindata.txt") {
        std::ifstream fin(dataFile.c_str());
        if (!fin) {
            std::cerr << "文件打开失败" << std::endl;
            std::exit(EXIT_FAILURE);
        }
        std::string line;
        while (std::getline(fin, line)) {
            std::istringstream row(line);
            int id = 0;
            if (!(row >> id)) {
                continue;  // blank or non-numeric line
            }
            // Extraction-driven loop: stops cleanly at trailing blanks or a
            // '\r' instead of re-pushing the last value.
            std::vector<int> values;
            int value = 0;
            while (row >> value) {
                values.push_back(value);
            }
            if (values.empty()) {
                continue;  // no label column present
            }
            const int label = values.back();
            values.pop_back();
            ds.push_back(Data(values, label));
        }
    }

    /// Prints every training row (features, then its label) to stdout.
    void displayDataSet() const {
        for (std::size_t i = 0; i < dataSet.size(); ++i) {
            for (std::size_t j = 0; j < dataSet[i].features.size(); ++j) {
                std::cout << dataSet[i].features[j] << " ";
            }
            std::cout << " 分类:" << dataSet[i].cls;
            std::cout << '\n';
        }
    }

    /// Fits the parameters by repeated ascent steps on the log-likelihood.
    ///
    /// The objective is a likelihood, so the parameters move *up* the
    /// gradient. Iteration stops once both the objective and every parameter
    /// changed by at most `delta` in the last step.
    ///
    /// @param lamda step size of each ascent step
    /// @param delta convergence threshold
    void logisticRegression(double lamda = 0.1, double delta = 0.0001) {
        double objLw = Lw(param);
        Param previous(param.w, param.d);
        gradient(lamda);
        double newObjLw = Lw(param);
        int iter = 0;
        std::cout << "初始:" << std::endl;
        displayIterProcess(iter, objLw, newObjLw, 1);
        while (std::fabs(newObjLw - objLw) > delta || !samewb(previous, param, delta)) {
            objLw = newObjLw;
            previous = param;
            gradient(lamda);
            newObjLw = Lw(param);
            ++iter;
            displayIterProcess(iter, objLw, newObjLw, 5);
        }
        std::cout << "迭代结束共迭代" << iter << "步" << std::endl;
        displayIterProcess(iter, objLw, newObjLw, 1);
    }

    /// Returns true when every weight and the intercept of the two parameter
    /// sets differ by at most `delta`. Parameter sets of different length are
    /// never considered equal.
    bool samewb(const Param& tparam, const Param& param, double delta) const {
        if (tparam.w.size() != param.w.size()) {
            return false;
        }
        for (std::size_t i = 0; i < tparam.w.size(); ++i) {
            if (std::fabs(tparam.w[i] - param.w[i]) > delta) {
                return false;
            }
        }
        return !(std::fabs(tparam.d - param.d) > delta);
    }

    /// Prints the objective, its change, and the current parameters whenever
    /// `iter` is a multiple of `mod` (`mod` must be non-zero).
    void displayIterProcess(int iter, double objLw, double newObjLw, int mod) const {
        if (iter % mod != 0) {
            return;
        }
        std::cout << "迭代" << iter << ":目标函数值【" << newObjLw << "】,两次迭代目标函数差值【 " << (newObjLw - objLw) << "】" << std::endl;
        std::cout << "模型参数:";
        for (std::size_t i = 0; i < param.w.size(); ++i) {
            std::cout << param.w[i] << " ";
        }
        std::cout << param.d << std::endl << std::endl;
    }

    /// Performs one ascent step of size `lam` on the weights and intercept.
    ///
    /// The weights are updated one after another, and each update already
    /// sees the new values of the weights before it; the intercept is updated
    /// last. This ordering is intentional and kept as-is because it affects
    /// how the fixed step size converges.
    void gradient(double lam) {
        for (std::size_t i = 0; i < param.w.size(); ++i) {
            double step = 0.0;
            for (std::size_t j = 0; j < dataSet.size(); ++j) {
                step += (dataSet[j].cls - logiFun(param, dataSet[j])) * dataSet[j].features[i] * lam;
            }
            param.w[i] += step;
        }
        double step = 0.0;
        for (std::size_t j = 0; j < dataSet.size(); ++j) {
            step += (dataSet[j].cls - logiFun(param, dataSet[j])) * lam;
        }
        param.d += step;
    }

    /// Logistic function f(x) = exp(w.x + d) / (1 + exp(w.x + d)), i.e. the
    /// modelled probability of class 1. Needed by the gradient.
    double logiFun(const Param& p, const Data& d) const {
        return sigmoid(innerWX(p, d));
    }

    /// Log-likelihood of the training set under `p`:
    /// sum over samples of cls * z - ln(1 + exp(z)), with z = w.x + d.
    double Lw(const Param& p) const {
        double total = 0.0;
        for (std::size_t i = 0; i < dataSet.size(); ++i) {
            const double z = innerWX(p, dataSet[i]);
            total += (dataSet[i].cls * z - softplus(z));
        }
        return total;
    }

    /// Returns w.x + d. Terminates the program with a failure status when
    /// the weight vector and the sample have different lengths.
    double innerWX(const Param& p, const Data& data) const {
        if (p.w.size() != data.features.size()) {
            std::cerr << "参数与实例的维度不匹配,不能进行内积计算" << std::endl;
            std::exit(EXIT_FAILURE);
        }
        double z = 0.0;
        for (std::size_t i = 0; i < p.w.size(); ++i) {
            z += (p.w[i] * data.features[i]);
        }
        z += p.d;
        return z;
    }

    /// Loads "./testdata.txt" and prints, for every row, its features, its
    /// recorded label and the predicted label. The prediction is 1 when
    /// P(Y=1|x) >= P(Y=0|x) and 0 otherwise.
    void predictClass() {
        std::vector<Data> testDataSet;
        loadDataSet(testDataSet, "./testdata.txt");
        std::cout << std::endl << "预测分类:" << std::endl;
        for (std::size_t i = 0; i < testDataSet.size(); ++i) {
            const double py1 = logiFun(param, testDataSet[i]);
            const double py0 = 1 - py1;
            std::cout << "实例: ";
            for (std::size_t j = 0; j < testDataSet[i].features.size(); ++j) {
                std::cout << testDataSet[i].features[j] << " ";
            }
            std::cout << "标记分类【" << testDataSet[i].cls << "】,";
            std::cout << "预测分类【" << (py1 >= py0 ? 1 : 0) << "】" << std::endl;
        }
    }

private:
    /// exp(z) / (1 + exp(z)) evaluated without overflowing: exp() is only
    /// ever called on a non-positive argument.
    static double sigmoid(double z) {
        if (z >= 0.0) {
            return 1.0 / (1.0 + std::exp(-z));
        }
        const double e = std::exp(z);
        return e / (1.0 + e);
    }

    /// ln(1 + exp(z)) evaluated as max(z, 0) + ln(1 + exp(-|z|)) so that a
    /// large z cannot overflow.
    static double softplus(double z) {
        return (z > 0.0 ? z : 0.0) + std::log(1.0 + std::exp(-std::fabs(z)));
    }

    std::vector<Data> dataSet;
    Param param;
};

// Define LOGISTIC_NO_MAIN to compile the model into another program (for
// example a test harness) without this entry point.
#ifndef LOGISTIC_NO_MAIN
int main() {
    Logistic logist;
    //logist.displayDataSet();
    logist.logisticRegression();
    logist.predictClass();
#ifdef _WIN32
    // Keeps the console window open; "pause" only exists on Windows.
    std::system("pause");
#endif
    return 0;
}
#endif
其中,testdata.txt 保存测试数据;
traindata.txt保存训练数据;
logistic.cpp是代码源文件。三个文件保存在同一目录下。
数据的格式如下:
10009 1 0 0 1 0 1
10025 0 0 1 2 0 0
20035 0 0 1 0 0 1
20053 1 0 0 0 0 0
每行有7个列值,第一列是一个ID号,在具体操作中,忽略该列。之后的5列,每一个都表示一个特征的取值;最后一列是分类标记(0或1)。
只有实践才能深刻理解。(实践出真知)
参考:
http://www.cnblogs.com/tornadomeet/p/3395593.html
http://www.cnblogs.com/jfcspring/p/3512356.html