在“逻辑斯特回归模型(logistic regression)”一文中阐述了logistic regression模型的理论基础,本节采用C++实现logistic regression。算法是由笔者跟我们老板一起合作完成(老板提供算法框架和理论上的支持,笔者编写代码,准备开源发布)。下面分数据结构、模型实现、模型训练、模型预测与usage五个部分介绍logistic regression模型的实现与用法。
1.数据结构
/******************************************************************** * Logistic Regression Classifier V0.10 * Implemented by Rui Xia(rxia@nlpr.ia.ac.cn) , Wang Tao(wangbogong@gmail.com) * Last updated on 2012-6-12. *********************************************************************/ #pragma once #include <iostream> #include <fstream> #include <iomanip> #include <string> #include <vector> #include <set> #include <map> #include <algorithm> #include <climits> #include <math.h> #include <time.h> # define VERSION "V0.10" # define VERSION_DATE "2012-6-12" using namespace std; const long double min_threshold=1e-300; struct sparse_feat //稀疏特征表示结构 { vector<int> id_vec; vector<float> value_vec; }; class LR //logistic regression实现类 { private: vector<sparse_feat> samp_feat_vec; vector<int> samp_class_vec; int feat_set_size; int class_set_size; vector< vector<float> > omega; //模型的参数矩阵omega = feat_set_size * class_set_size public: LR(); ~LR(); void save_model(string model_file); void load_model(string model_file); void load_training_file(string training_file); void init_omega(); int train_online(int max_loop, double loss_thrd, float learn_rate, float lambda, int avg); //logistic regression随机梯度优化算法 vector<float> calc_score(sparse_feat &samp_feat); vector<float> score_to_prb(vector<float> &score); int score_to_class(vector<float> &score); float classify_testing_file(string testing_file, string output_file, int output_format); //模型分类预测 private: void read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec); //更新函数 void update_online_ce(int samp_class, sparse_feat &samp_feat, float learn_rate, float lambda); void calc_loss_ce(double *loss, float *acc); //计算损失函数 float calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec); float sigmoid(float x); vector<string> string_split(string terms_str, string spliting_tag); };
sparse_feat结构体表示的稀疏的存在结构,id_vec用于存放特征的id号,而value_vec用于存在对应的特征值。LR类是logistic regression 实现类,主要的两个函数train_online()与
classify_testing_file(),train_online()函数实现了logistic regression模型的随机梯度下降优化算法,classify_testing_file()函数实现了logistic regression模型对测试样本的预测。
2.模型的实现
#include "LR.h" LR::LR() { } LR::~LR() { } void LR::save_model(string model_file) { cout << "Saving model..." << endl; ofstream fout(model_file.c_str()); for (int k = 0; k < feat_set_size; k++) { for (int j = 0; j < class_set_size; j++) { fout << omega[k][j] << " "; } fout << endl; } fout.close(); } void LR::load_model(string model_file) { cout << "Loading model..." << endl; omega.clear(); ifstream fin(model_file.c_str()); if(!fin) { cerr << "Error opening file: " << model_file << endl; exit(0); } string line_str; while (getline(fin, line_str)) { vector<string> line_vec = string_split(line_str, " "); vector<float> line_omega; for (vector<string>::iterator it = line_vec.begin(); it != line_vec.end(); it++) { float weight = (float)atof(it->c_str()); line_omega.push_back(weight); } omega.push_back(line_omega); } fin.close(); feat_set_size = (int)omega.size(); class_set_size = (int)omega[0].size(); } void LR::read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec) { ifstream fin(samp_file.c_str()); if(!fin) { cerr << "Error opening file: " << samp_file << endl; exit(0); } string line_str; while (getline(fin, line_str)) { size_t class_pos = line_str.find_first_of("\t"); int class_id = atoi(line_str.substr(0, class_pos).c_str()); samp_class_vec.push_back(class_id); string terms_str = line_str.substr(class_pos+1); sparse_feat samp_feat; samp_feat.id_vec.push_back(0); // bias samp_feat.value_vec.push_back(1); // bias if (terms_str != "") { vector<string> fv_vec = string_split(terms_str, " "); for (vector<string>::iterator it = fv_vec.begin(); it != fv_vec.end(); it++) { size_t feat_pos = it->find_first_of(":"); int feat_id = atoi(it->substr(0, feat_pos).c_str()); float feat_value = (float)atof(it->substr(feat_pos+1).c_str()); samp_feat.id_vec.push_back(feat_id); samp_feat.value_vec.push_back(feat_value); } } samp_feat_vec.push_back(samp_feat); } fin.close(); } void LR::load_training_file(string training_file) { cout << "Loading training data..." << endl; read_samp_file(training_file, samp_feat_vec, samp_class_vec); feat_set_size = 0; class_set_size = 0; for (size_t i = 0; i < samp_class_vec.size(); i++) { if (samp_class_vec[i] > class_set_size) { class_set_size = samp_class_vec[i]; } if (samp_feat_vec[i].id_vec.back() > feat_set_size) { feat_set_size = samp_feat_vec[i].id_vec.back(); } } class_set_size += 1; feat_set_size += 1; } void LR::init_omega() { float init_value = 0.0; //float init_value = (float)1/class_set_size; for (int i = 0; i < feat_set_size; i++) { vector<float> temp_vec(class_set_size, init_value); omega.push_back(temp_vec); } } // Stochastic Gradient Descent (SGD) optimization for the criteria of Cross Entropy (CE) int LR::train_online( int max_loop, double loss_thrd, float learn_rate, float lambda, int avg) { int id = 0; double loss = 0.0; double loss_pre = 0.0; vector< vector<float> > omega_pre=omega; float acc=0.0; vector< vector<float> > omega_sum(omega); while (id <= max_loop*(int)samp_class_vec.size()) { if (id%samp_class_vec.size() == 0) // 完成一次迭代,预处理工作。 { int loop = id/(int)samp_class_vec.size(); //check loss loss = 0.0; acc = 0.0; calc_loss_ce(&loss, &acc); cout.setf(ios::left); cout << "Iter: " << setw(8) << loop << "Loss: " << setw(18) << loss << "Acc: " << setw(8) << acc << endl; if ((loss_pre - loss) < loss_thrd && loss_pre >= loss && id != 0) { cout << "Reaching the minimal loss value decrease!" << endl; break; } loss_pre = loss; if (id) //表示第一次不做正则项计算 { for (int i=0;i!=omega_pre.size();i++) for (int j=0;j!=omega_pre[i].size();j++) omega[i][j]+=omega_pre[i][j]*lambda; } omega_pre=omega; } // update omega int r = (int)(rand()%samp_class_vec.size()); sparse_feat samp_feat = samp_feat_vec[r]; int samp_class = samp_class_vec[r]; update_online_ce(samp_class, samp_feat, learn_rate, lambda); if (avg == 1 && id%samp_class_vec.size() == 0) { for (int i = 0; i < feat_set_size; i++) { for (int j = 0; j < class_set_size; j++) { omega_sum[i][j] += omega[i][j]; } } } id++; } if (avg == 1) { for (int i = 0; i < feat_set_size; i++) { for (int j = 0; j < class_set_size; j++) { omega[i][j] = (float)omega_sum[i][j] / id; } } } return 0; } void LR::update_online_ce(int samp_class, sparse_feat &samp_feat, float learn_rate, float lambda) { vector<float> score=calc_score(samp_feat);//(W'*X) vector<float> softMaxVec(class_set_size); float maxScore=*(max_element(score.begin(),score.end())); float softMaxSum=0; for (int j=0;j<class_set_size;j++) { softMaxVec[j]=exp(score[j]-maxScore); softMaxSum+=softMaxVec[j]; //同时除最大的score; } for (int k=0;k<class_set_size;k++) softMaxVec[k]/=softMaxSum; for (int i=0;i<class_set_size;i++) //对于每一个类 { float error_term=((int)(i==samp_class)-softMaxVec[i]); for (int j=0;j<samp_feat.id_vec.size();j++) //对于每个类中的 { int feat_id=samp_feat.id_vec[j]; float feat_value=samp_feat.value_vec[j]; float delt=error_term*feat_value; //float regul = lambda * omega[feat_id][i]; omega[feat_id][i]+=learn_rate*delt; } } } void LR::calc_loss_ce(double *loss, float *acc) { double loss_value = 0.0; int err_num = 0; for (size_t i = 0; i < samp_class_vec.size(); i++) { int samp_class = samp_class_vec[i]; sparse_feat samp_feat = samp_feat_vec[i]; vector<float> score = calc_score(samp_feat); vector<float> softMaxVec(class_set_size); float softMaxSum=0; int pred_class = score_to_class(score); if (pred_class != samp_class) { err_num += 1; } float maxScore=*(max_element(score.begin(),score.end())); for (int k=0;k<class_set_size;k++) { softMaxVec[k]=exp(score[k]-maxScore); softMaxSum+=softMaxVec[k]; //同时除最大的score; } for (int j = 0; j < class_set_size; j++) { if (j == samp_class) { double yi=softMaxVec[j]/softMaxSum; long double temp=yi<min_threshold ? min_threshold:yi; loss_value -= log(temp); } } } *loss = loss_value ; *acc = 1 - (float)err_num / samp_class_vec.size(); } vector<float> LR::calc_score(sparse_feat &samp_feat) { vector<float> score(class_set_size, 0); for (int j = 0; j < class_set_size; j++) { for (size_t k = 0; k < samp_feat.id_vec.size(); k++) { int feat_id = samp_feat.id_vec[k]; float feat_value = samp_feat.value_vec[k]; score[j] += omega[feat_id][j] * feat_value; } } return score; } vector<float> LR::score_to_prb(vector<float> &score) { //vector<float> prb(class_set_size, 0); //for (int i = 0; i < class_set_size; i++) //{ // float delta_prb_sum = 0.0; // for (int j = 0; j < class_set_size; j++) // { // delta_prb_sum += exp(score[j] - score[i]); // } // prb[i] = 1 / delta_prb_sum; //} //return prb; vector<float> softMaxVec(class_set_size); float maxScore=*(max_element(score.begin(),score.end())); float softMaxSum=0; for (int j=0;j<class_set_size;j++) { softMaxVec[j]=exp(score[j]-maxScore); softMaxSum+=softMaxVec[j]; //同时除最大的score; } for (int k=0;k<class_set_size;k++) softMaxVec[k]/=softMaxSum; return softMaxVec; } int LR::score_to_class(vector<float> &score) { int pred_class = 0; float max_score = score[0]; for (int j = 1; j < class_set_size; j++) { if (score[j] > max_score) { max_score = score[j]; pred_class = j; } } return pred_class; } float LR::classify_testing_file(string testing_file, string output_file, int output_format) { cout << "Classifying testing file..." << endl; vector<sparse_feat> test_feat_vec; vector<int> test_class_vec; vector<int> pred_class_vec; read_samp_file(testing_file, test_feat_vec, test_class_vec); ofstream fout(output_file.c_str()); for (size_t i = 0; i < test_class_vec.size(); i++) { int samp_class = test_class_vec[i]; sparse_feat samp_feat = test_feat_vec[i]; vector<float> pred_score = calc_score(samp_feat); int pred_class = score_to_class(pred_score); pred_class_vec.push_back(pred_class); fout << pred_class << "\t"<<samp_class<<"\t"; if (output_format == 1) { for (int j = 0; j < class_set_size; j++) { fout << j << ":" << pred_score[j] << ' '; } } else if (output_format == 2) { vector<float> pred_prb = score_to_prb(pred_score); for (int j = 0; j < class_set_size; j++) { fout << j << ":" << pred_prb[j] << ' '; } } fout << endl; } fout.close(); float acc = calc_acc(test_class_vec, pred_class_vec); return acc; } float LR::calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec) { size_t len = test_class_vec.size(); if (len != pred_class_vec.size()) { cerr << "Error: two vectors should have the same lenght." << endl; exit(0); } int err_num = 0; for (size_t id = 0; id != len; id++) { if (test_class_vec[id] != pred_class_vec[id]) { err_num++; } } return 1 - ((float)err_num) / len; } float LR::sigmoid(float x) { double sgm = 1 / (1+exp(-(double)x)); return (float)sgm; } vector<string> LR::string_split(string terms_str, string spliting_tag) { vector<string> feat_vec; size_t term_beg_pos = 0; size_t term_end_pos = 0; while ((term_end_pos = terms_str.find_first_of(spliting_tag, term_beg_pos)) != string::npos) { if (term_end_pos > term_beg_pos) { string term_str = terms_str.substr(term_beg_pos, term_end_pos - term_beg_pos); feat_vec.push_back(term_str); } term_beg_pos = term_end_pos + 1; } if (term_beg_pos < terms_str.size()) { string end_str = terms_str.substr(term_beg_pos); feat_vec.push_back(end_str); } return feat_vec; }
3.模型训练代码
#include <cstdlib> #include <iostream> #include <cstring> #include "LR.h" using namespace std; void print_help() { cout << "\nOpenPR-LR training module, " << VERSION << ", " << VERSION_DATE << "\n\n" << "usage: LR_train [options] training_file model_file [pre_model_file]\n\n" << "options: -h -> help\n" << " -n int -> maximal iteration loops (default 200)\n" << " -m double -> minimal loss value decrease (default 1e-03)\n" << " -r double -> regularization parameter lambda of gaussian prior (default 0)\n" << " -l float -> learning rate (default 1.0)\n" << " -a -> 0: final weight (default)\n" << " -> 1: average weights of all iteration loops\n" << " -u [0,1] -> 0: initial training model (default)\n" << " -> 1: updating model (pre_model_file is needed)\n" << endl; } void read_parameters(int argc, char *argv[], char *training_file, char *model_file, int *max_loop, double *loss_thrd, float *learn_rate, float *lambda, int *avg, int *update, char *pre_model_file) { // set default options *max_loop = 200; *loss_thrd = 1e-3; *learn_rate = 1.0; *lambda = 0.0; *avg = 0; *update = 0; int i; for (i = 1; (i<argc) && (argv[i])[0]=='-'; i++) { switch ((argv[i])[1]) { case 'h': print_help(); exit(0); case 'n': *max_loop = atoi(argv[++i]); break; case 'm': *loss_thrd = atof(argv[++i]); break; case 'l': *learn_rate = (float)atof(argv[++i]); break; case 'r': *lambda = (float)atof(argv[++i]); break; case 'a': *avg = atoi(argv[++i]); break; case 'u': *update = atoi(argv[++i]); break; default: cout << "Unrecognized option: " << argv[i] << "!" << endl; print_help(); exit(0); } } if ((i+1)>=argc) { cout << "Not enough parameters!" << endl; print_help(); exit(0); } strcpy (training_file, argv[i]); strcpy (model_file, argv[i+1]); if (*update) { if ((i+2)>=argc) { cout << "Previous model file is needed in update mode!" << endl; print_help(); exit(0); } strcpy (pre_model_file, argv[i+2]); } } int LR_train(int argc, char *argv[]) { char training_file[200]; char model_file[200]; int max_loop; double loss_thrd; float learn_rate; float lambda; int avg; int update; char pre_model_file[200]; read_parameters(argc, argv, training_file, model_file, &max_loop, &loss_thrd, &learn_rate, &lambda, &avg, &update, pre_model_file); LR LR; LR.load_training_file(training_file); if (update) { LR.load_model(pre_model_file); } else { LR.init_omega(); } LR.train_online( max_loop, loss_thrd, learn_rate, lambda, avg); LR.save_model(model_file); return 0; } int main(int argc, char *argv[]) { return LR_train(argc, argv); }
4.模型预测
#include <cstdlib> #include <iostream> #include <cstring> #include "LR.h" using namespace std; void print_help() { cout << "\nOpenPR-LR classification module, " << VERSION << ", " << VERSION_DATE << "\n\n" << "usage: LR_classify [options] testing_file model_file output_file\n\n" << "options: -h -> help\n" << " -f [0..2] -> 0: only output class label (default)\n" << " -> 1: output class label with log-likelihood (weighted sum)\n" << " -> 2: output class label with soft probability\n" << endl; } void read_parameters(int argc, char *argv[], char *testing_file, char *model_file, char *output_file, int *output_format) { // set default options *output_format = 0; int i; for (i = 1; (i<argc) && (argv[i])[0]=='-'; i++) { switch ((argv[i])[1]) { case 'h': print_help(); exit(0); case 'f': *output_format = atoi(argv[++i]); break; default: cout << "Unrecognized option: " << argv[i] << "!" << endl; print_help(); exit(0); } } if ((i+2)>=argc) { cout << "Not enough parameters!" << endl; print_help(); exit(0); } strcpy(testing_file, argv[i]); strcpy(model_file, argv[i+1]); strcpy(output_file, argv[i+2]); } int LR_classify(int argc, char *argv[]) { char testing_file[200]; char model_file[200]; char output_file[200]; int output_format; read_parameters(argc, argv, testing_file, model_file, output_file, &output_format); LR LR; LR.load_model(model_file); float acc = LR.classify_testing_file(testing_file, output_file, output_format); cout << "Accuracy: " << acc << endl; //ofstream outfile("d:\\result.txt",ios::app); //outfile<<testing_file<<"\t"<<acc<<endl; return 0; } int main(int argc, char *argv[]) { return LR_classify(argc, argv); }
5.usage
算法的处理样本的数据格式与libsvm样本数据格式一致,均采用稀疏的存储结构。
模型训练:> lr_train -n 50 -m 1e-06 data/train.samp result/ldf.mod。训练样本保存在data/train.samp文件,模型训练迭代的最大次数是50,最小损失值是1e-06,模型文件存储在result/ldf.mod文件。
分类预测:> lr_classify data/test.samp data/ldf.mod result/ldf.out。测试样本存放在data/test.samp文件,样本输出的结果文件存在在result/ldf.out文件。
(ps:这里是logisitc regerssion的工具包,包括了说明了文档,源代码和vs2010编译的可执行程序)