zoukankan      html  css  js  c++  java
  • logistic regression C++实现

      在“逻辑斯特回归模型(logistic regression)”一文中阐述了logistic regression模型的理论基础,本节采用C++实现logistic regression。算法是由笔者跟我们老板一起合作完成(老板提供算法框架和理论上的支持,笔者编写代码,准备开源发布)。下面分数据结构、模型实现、模型训练、模型预测与usage五个部分介绍logistic regression模型的实现与用法。

      1.数据结构

    /********************************************************************
    * Logistic Regression Classifier V0.10
    * Implemented by Rui Xia(rxia@nlpr.ia.ac.cn) , Wang Tao(wangbogong@gmail.com)
    * Last updated on 2012-6-12. 
    *********************************************************************/
    #pragma once
    #include <iostream>
    #include <fstream>
    #include <iomanip>
    #include <string>
    #include <vector>
    #include <set>
    #include <map>
    #include <algorithm>
    #include <climits>
    #include <math.h>
    #include <time.h>
    
    # define VERSION       "V0.10"
    # define VERSION_DATE  "2012-6-12"
    
    using namespace std;
    
    const long  double min_threshold=1e-300;
    
    struct sparse_feat                       //稀疏特征表示结构
    {
        vector<int> id_vec;
        vector<float> value_vec;
    };
    
    class LR                                 //logistic regression实现类
    {
    private:
        vector<sparse_feat> samp_feat_vec;
        vector<int> samp_class_vec;
        int feat_set_size;
        int class_set_size;
        vector< vector<float> > omega; //模型的参数矩阵omega = feat_set_size * class_set_size
         
    public:
        LR();
        ~LR();
        void save_model(string model_file);
        void load_model(string model_file);
        void load_training_file(string training_file);
        void init_omega();
        
        int train_online(int max_loop, double loss_thrd, float learn_rate, float lambda, int avg);    //logistic regression随机梯度优化算法
        vector<float> calc_score(sparse_feat &samp_feat);
        vector<float> score_to_prb(vector<float> &score);
        int score_to_class(vector<float> &score);
        
        float classify_testing_file(string testing_file, string output_file, int output_format);      //模型分类预测
    
    private:
        void read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec);   //更新函数
        void update_online_ce(int samp_class, sparse_feat &samp_feat, float learn_rate, float lambda);
        void calc_loss_ce(double *loss, float *acc);                                                              //计算损失函数
        float calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec);    
        float sigmoid(float x);
        vector<string> string_split(string terms_str, string spliting_tag);
    
    };
      sparse_feat结构体表示的稀疏的存在结构,id_vec用于存放特征的id号,而value_vec用于存在对应的特征值。LR类是logistic regression 实现类,主要的两个函数train_online()与
    classify_testing_file(),train_online()函数实现了logistic regression模型的随机梯度下降优化算法,classify_testing_file()函数实现了logistic regression模型对测试样本的预测。

      2.模型的实现
    
    #include "LR.h"
    
    LR::LR()
    {
    
    }
    
    LR::~LR()
    {
    }
    
    void LR::save_model(string model_file)
    { 
        cout << "Saving model..." << endl;
        ofstream fout(model_file.c_str());
        for (int k = 0; k < feat_set_size; k++) {
            for (int j = 0; j < class_set_size; j++) {
                fout << omega[k][j] << " ";
            }
            fout << endl;
        }
        fout.close();
    }
    
    
    void LR::load_model(string model_file)
    {
        cout << "Loading model..." << endl;
        omega.clear();
        ifstream fin(model_file.c_str());
        if(!fin) {
            cerr << "Error opening file: " << model_file << endl;
            exit(0);
        }
        string line_str;
        while (getline(fin, line_str)) {
            vector<string> line_vec = string_split(line_str, " ");
            vector<float>  line_omega;
            for (vector<string>::iterator it = line_vec.begin(); it != line_vec.end(); it++) {
                float weight = (float)atof(it->c_str());
                line_omega.push_back(weight);
            }
            omega.push_back(line_omega);
        }
        fin.close();
        feat_set_size = (int)omega.size();
        class_set_size = (int)omega[0].size();
    }
    
    
    void LR::read_samp_file(string samp_file, vector<sparse_feat> &samp_feat_vec, vector<int> &samp_class_vec) {
        ifstream fin(samp_file.c_str());
        if(!fin) {
            cerr << "Error opening file: " << samp_file << endl;
            exit(0);
        }
        string line_str;
        while (getline(fin, line_str)) 
        {
            size_t class_pos = line_str.find_first_of("\t");
            int class_id = atoi(line_str.substr(0, class_pos).c_str());
            samp_class_vec.push_back(class_id);
            string terms_str = line_str.substr(class_pos+1);
            sparse_feat samp_feat;
            samp_feat.id_vec.push_back(0); // bias
            samp_feat.value_vec.push_back(1); // bias
            if (terms_str != "") 
            {
                vector<string> fv_vec = string_split(terms_str, " ");
                for (vector<string>::iterator it = fv_vec.begin(); it != fv_vec.end(); it++) 
                {
                    size_t feat_pos = it->find_first_of(":");
                    int feat_id = atoi(it->substr(0, feat_pos).c_str());
                    float feat_value = (float)atof(it->substr(feat_pos+1).c_str());
                    samp_feat.id_vec.push_back(feat_id);
                    samp_feat.value_vec.push_back(feat_value);
                }
            }
            samp_feat_vec.push_back(samp_feat);
        }
        fin.close();
    }
    
    
    void LR::load_training_file(string training_file)
    {
        cout << "Loading training data..." << endl;
        read_samp_file(training_file, samp_feat_vec, samp_class_vec);
        feat_set_size = 0;
        class_set_size = 0;
        for (size_t i = 0; i < samp_class_vec.size(); i++) {
            if (samp_class_vec[i] > class_set_size) {
                class_set_size = samp_class_vec[i];
            }
            if (samp_feat_vec[i].id_vec.back() > feat_set_size) {
                feat_set_size = samp_feat_vec[i].id_vec.back();
            }    
        }
        class_set_size += 1;
        feat_set_size += 1;
    }
    
    void LR::init_omega()
    {
         float init_value = 0.0;
        //float init_value = (float)1/class_set_size;
        for (int i = 0; i < feat_set_size; i++) 
        {
            vector<float> temp_vec(class_set_size, init_value);
            omega.push_back(temp_vec);
        }
    }
    
    // Stochastic Gradient Descent (SGD) optimization for the criteria of  Cross Entropy (CE)
    int LR::train_online( int max_loop, double loss_thrd, float learn_rate, float lambda, int avg)
    {
        int id = 0;
        double loss = 0.0;
        double loss_pre = 0.0;
        vector< vector<float> > omega_pre=omega;
        float acc=0.0;
    
        vector< vector<float> > omega_sum(omega);
        
        while (id <= max_loop*(int)samp_class_vec.size()) 
        {
        
            
            if (id%samp_class_vec.size() == 0)    // 完成一次迭代,预处理工作。
            {
                int loop = id/(int)samp_class_vec.size();               //check loss
                loss = 0.0;
                acc = 0.0;
                
                calc_loss_ce(&loss, &acc);    
                cout.setf(ios::left);
                cout << "Iter: " << setw(8) << loop << "Loss: " << setw(18) << loss << "Acc: " << setw(8) << acc << endl;
                if ((loss_pre - loss) < loss_thrd && loss_pre >= loss && id != 0)
                {
                    cout << "Reaching the minimal loss value decrease!" << endl;
                    break;
                }
                loss_pre = loss;
                                    
                if (id)            //表示第一次不做正则项计算
                {
                    for (int i=0;i!=omega_pre.size();i++)
                        for (int j=0;j!=omega_pre[i].size();j++)
                            omega[i][j]+=omega_pre[i][j]*lambda;
                }
    
                omega_pre=omega;
            }
    
            // update omega
            int r = (int)(rand()%samp_class_vec.size());
            sparse_feat samp_feat = samp_feat_vec[r];
            int samp_class = samp_class_vec[r];
        
             
           update_online_ce(samp_class, samp_feat, learn_rate, lambda);
            
            if (avg == 1 && id%samp_class_vec.size() == 0) 
            {
                for (int i = 0; i < feat_set_size; i++) 
                {
                    for (int j = 0; j < class_set_size; j++) 
                    {
                        omega_sum[i][j] += omega[i][j];
                    }
                }            
            }
            id++;
        }
    
        if (avg == 1) 
        {
            for (int i = 0; i < feat_set_size; i++) 
            {
                for (int j = 0; j < class_set_size; j++)
                {
                    omega[i][j] = (float)omega_sum[i][j] / id;
                }
            }        
        }
        return 0;
    }
    
    void LR::update_online_ce(int samp_class, sparse_feat &samp_feat, float learn_rate, float lambda)
    {
        
        vector<float> score=calc_score(samp_feat);//(W'*X)
        vector<float> softMaxVec(class_set_size);
    
        float maxScore=*(max_element(score.begin(),score.end()));
        float softMaxSum=0;
    
        for (int j=0;j<class_set_size;j++)
        {
            softMaxVec[j]=exp(score[j]-maxScore);
            softMaxSum+=softMaxVec[j];                             //同时除最大的score;
        }
        for (int k=0;k<class_set_size;k++)
            softMaxVec[k]/=softMaxSum;
    
    
        for (int i=0;i<class_set_size;i++)                          //对于每一个类
        {
            float error_term=((int)(i==samp_class)-softMaxVec[i]);
            for (int j=0;j<samp_feat.id_vec.size();j++)             //对于每个类中的
            {
                int feat_id=samp_feat.id_vec[j];
                float feat_value=samp_feat.value_vec[j];
                float delt=error_term*feat_value;
                //float regul = lambda * omega[feat_id][i];
                omega[feat_id][i]+=learn_rate*delt;
    
            }
    
        }
        
         
    }
    
    void LR::calc_loss_ce(double *loss, float *acc)
    {
        double loss_value = 0.0;
        int err_num = 0;
    
        for (size_t i = 0; i < samp_class_vec.size(); i++) 
        {
            int samp_class = samp_class_vec[i];
            sparse_feat samp_feat = samp_feat_vec[i];
    
            vector<float> score = calc_score(samp_feat);
            vector<float> softMaxVec(class_set_size);
            float softMaxSum=0;
    
            int pred_class = score_to_class(score);
            if (pred_class != samp_class) 
            {
                err_num += 1;
            }
    
            float maxScore=*(max_element(score.begin(),score.end()));
            for (int k=0;k<class_set_size;k++)
            {
               softMaxVec[k]=exp(score[k]-maxScore);
               softMaxSum+=softMaxVec[k];                     //同时除最大的score;
            }
    
            for (int j = 0; j < class_set_size; j++)
            {
                if (j == samp_class) 
                { 
    
                    double yi=softMaxVec[j]/softMaxSum;
                    long double temp=yi<min_threshold ? min_threshold:yi;
                    loss_value -= log(temp);                             
            
                }
    
    
            }
        }
    
        *loss = loss_value ;
        *acc = 1 - (float)err_num / samp_class_vec.size();
    }
    
    vector<float> LR::calc_score(sparse_feat &samp_feat)
    {
        vector<float> score(class_set_size, 0);
        for (int j = 0; j < class_set_size; j++)
        {
            for (size_t k = 0; k < samp_feat.id_vec.size(); k++) 
            {
                int feat_id = samp_feat.id_vec[k];
                float feat_value = samp_feat.value_vec[k];
                score[j] += omega[feat_id][j] * feat_value;
            }
        }
        return score;
    }
    
    vector<float> LR::score_to_prb(vector<float> &score)
    {   
        //vector<float> prb(class_set_size, 0);
        //for (int i = 0; i < class_set_size; i++) 
        //{
        //    float delta_prb_sum = 0.0;
        //    for (int j = 0; j < class_set_size; j++) 
        //    {
        //        delta_prb_sum += exp(score[j] - score[i]);
        //    }
        //    prb[i] = 1 / delta_prb_sum;
        //}
        //return prb;
    
        vector<float> softMaxVec(class_set_size);
    
        float maxScore=*(max_element(score.begin(),score.end()));
        float softMaxSum=0;
    
        for (int j=0;j<class_set_size;j++)
        {
            softMaxVec[j]=exp(score[j]-maxScore);
            softMaxSum+=softMaxVec[j];                             //同时除最大的score;
        }
        for (int k=0;k<class_set_size;k++)
            softMaxVec[k]/=softMaxSum;
    
        return softMaxVec;
    
    }
    
    
    int LR::score_to_class(vector<float> &score)
    {
        int pred_class = 0;    
        float max_score = score[0];
        for (int j = 1; j < class_set_size; j++) {
            if (score[j] > max_score) {
                max_score = score[j];
                pred_class = j;
            }
        }
        return pred_class;
    }
    
    float LR::classify_testing_file(string testing_file, string output_file, int output_format)
    {
        cout << "Classifying testing file..." << endl;
        vector<sparse_feat> test_feat_vec;
        vector<int> test_class_vec;
        vector<int> pred_class_vec;
        read_samp_file(testing_file, test_feat_vec, test_class_vec);
        ofstream fout(output_file.c_str());
        for (size_t i = 0; i < test_class_vec.size(); i++) 
        {
            int samp_class = test_class_vec[i];
            sparse_feat samp_feat = test_feat_vec[i];
            vector<float> pred_score = calc_score(samp_feat);            
            int pred_class = score_to_class(pred_score);
            pred_class_vec.push_back(pred_class);
            fout << pred_class << "\t"<<samp_class<<"\t";
            if (output_format == 1) 
            {
                for (int j = 0; j < class_set_size; j++) 
                {
                    fout << j << ":" << pred_score[j] << ' '; 
                }        
            }
            else if (output_format == 2) 
            {
                vector<float> pred_prb = score_to_prb(pred_score);
                for (int j = 0; j < class_set_size; j++)
                {
                    fout << j << ":" << pred_prb[j] << ' '; 
                }
            }
    
            fout << endl;        
        }
        fout.close();
        float acc = calc_acc(test_class_vec, pred_class_vec);
        return acc;
    }
    
    float LR::calc_acc(vector<int> &test_class_vec, vector<int> &pred_class_vec)
    {
        size_t len = test_class_vec.size();
        if (len != pred_class_vec.size()) {
            cerr << "Error: two vectors should have the same lenght." << endl;
            exit(0);
        }
        int err_num = 0;
        for (size_t id = 0; id != len; id++) {
            if (test_class_vec[id] != pred_class_vec[id]) {
                err_num++;
            }
        }
        return 1 - ((float)err_num) / len;
    }
    
    float LR::sigmoid(float x) 
    {
        double sgm = 1 / (1+exp(-(double)x));
        return (float)sgm;
    }
    
    vector<string> LR::string_split(string terms_str, string spliting_tag)
    {
        vector<string> feat_vec;
        size_t term_beg_pos = 0;
        size_t term_end_pos = 0;
        while ((term_end_pos = terms_str.find_first_of(spliting_tag, term_beg_pos)) != string::npos) 
        {
            if (term_end_pos > term_beg_pos)
            {
                string term_str = terms_str.substr(term_beg_pos, term_end_pos - term_beg_pos);
                feat_vec.push_back(term_str);
            }
            term_beg_pos = term_end_pos + 1;
        }
        if (term_beg_pos < terms_str.size())
        {
            string end_str = terms_str.substr(term_beg_pos);
            feat_vec.push_back(end_str);
        }
        return feat_vec;
    }

      3.模型训练代码

    #include <cstdlib>
    #include <iostream>
    #include <cstring>
    
    #include "LR.h"
    
    using namespace std;
    
    
    void print_help() 
    {
        cout << "\nOpenPR-LR training module, " << VERSION << ", " << VERSION_DATE << "\n\n"
            << "usage: LR_train [options] training_file model_file [pre_model_file]\n\n"
            << "options: -h        -> help\n"
            << "         -n int    -> maximal iteration loops (default 200)\n"
            << "         -m double -> minimal loss value decrease (default 1e-03)\n"
            << "         -r double -> regularization parameter lambda of gaussian prior (default 0)\n"        
            << "         -l float  -> learning rate (default 1.0)\n"
            << "         -a        -> 0: final weight (default)\n"
            << "                   -> 1: average weights of all iteration loops\n"
            << "         -u [0,1]  -> 0: initial training model (default)\n"
            << "                   -> 1: updating model (pre_model_file is needed)\n" 
            << endl;
    }
    
    void read_parameters(int argc, char *argv[], char *training_file, char *model_file, 
                         int *max_loop, double *loss_thrd, float *learn_rate, float *lambda,
                            int *avg, int *update, char *pre_model_file)
    {
        // set default options
        *max_loop = 200;
        *loss_thrd = 1e-3;
        *learn_rate = 1.0;
        *lambda = 0.0;
        *avg = 0;
        *update = 0;
        int i;
        for (i = 1; (i<argc) && (argv[i])[0]=='-'; i++) 
        {
            switch ((argv[i])[1]) {
                case 'h':
                    print_help();
                    exit(0);
                case 'n':
                    *max_loop = atoi(argv[++i]);
                    break;
                case 'm':
                    *loss_thrd = atof(argv[++i]);
                    break;
                case 'l':
                    *learn_rate = (float)atof(argv[++i]);
                    break;
                case 'r':
                    *lambda = (float)atof(argv[++i]);
                    break;
                case 'a':
                    *avg = atoi(argv[++i]);
                    break;
                case 'u':
                    *update = atoi(argv[++i]);
                    break;
                default:
                    cout << "Unrecognized option: " << argv[i] << "!" << endl;
                    print_help();
                    exit(0);
            }
        }
        
        if ((i+1)>=argc) 
        {
            cout << "Not enough parameters!" << endl;
            print_help();
            exit(0);
        }
    
        strcpy (training_file, argv[i]);
        strcpy (model_file, argv[i+1]);
        if (*update) 
        {
            if ((i+2)>=argc) 
            {
                cout << "Previous model file is needed in update mode!" << endl;
                print_help();
                exit(0);
            }
            strcpy (pre_model_file, argv[i+2]);
        }
    }
    
    int LR_train(int argc, char *argv[])
    {
        char training_file[200];
        char model_file[200];
        int max_loop;
        double loss_thrd;
        float learn_rate;
        float lambda;
        int avg;
        int update;
        char pre_model_file[200];
        read_parameters(argc, argv, training_file, model_file, &max_loop, &loss_thrd, &learn_rate, &lambda, &avg, &update, pre_model_file);
        
        LR LR;
        LR.load_training_file(training_file);
        if (update) 
        {
            LR.load_model(pre_model_file);
        }
        else 
        {
            LR.init_omega();    
        }
        LR.train_online( max_loop, loss_thrd, learn_rate, lambda, avg);
        LR.save_model(model_file);
        return 0;
    }
    
    int main(int argc, char *argv[])
    {
        return LR_train(argc, argv);
    }

      4.模型预测

    #include <cstdlib>
    #include <iostream>
    #include <cstring>
    #include "LR.h"
    
    using namespace std;
    
    
    void print_help()
    {
        cout << "\nOpenPR-LR classification module, " << VERSION << ", " << VERSION_DATE << "\n\n"
            << "usage: LR_classify [options] testing_file model_file output_file\n\n"
            << "options: -h        -> help\n"
            << "         -f [0..2] -> 0: only output class label (default)\n"
            << "                   -> 1: output class label with log-likelihood (weighted sum)\n"
            << "                   -> 2: output class label with soft probability\n"
            << endl;
    }
    
    void read_parameters(int argc, char *argv[], char *testing_file, char *model_file, 
                            char *output_file, int *output_format) 
    {
        // set default options
        *output_format = 0;
        int i;
        for (i = 1; (i<argc) && (argv[i])[0]=='-'; i++)
        {
            switch ((argv[i])[1]) {
                case 'h':
                    print_help();
                    exit(0);
                case 'f':
                    *output_format = atoi(argv[++i]);
                    break;
                default:
                    cout << "Unrecognized option: " << argv[i] << "!" << endl;
                    print_help();
                    exit(0);
            }
        }
        
        if ((i+2)>=argc)
        {
            cout << "Not enough parameters!" << endl;
            print_help();
            exit(0);
        }
        strcpy(testing_file, argv[i]);
        strcpy(model_file, argv[i+1]);
        strcpy(output_file, argv[i+2]);
    }
    
    int LR_classify(int argc, char *argv[])
    {
        char testing_file[200];
        char model_file[200];
        char output_file[200];
        int output_format;
        read_parameters(argc, argv, testing_file, model_file, output_file, &output_format);
        LR LR;
        LR.load_model(model_file);
        float acc = LR.classify_testing_file(testing_file, output_file, output_format);
        cout << "Accuracy: " << acc << endl;
        //ofstream outfile("d:\\result.txt",ios::app);
        //outfile<<testing_file<<"\t"<<acc<<endl;
        return 0;
    }
    
    int main(int argc, char *argv[])
    {
        return LR_classify(argc, argv);
    }

      5.usage

      算法的处理样本的数据格式与libsvm样本数据格式一致,均采用稀疏的存储结构。

      模型训练:> lr_train -n 50 -m 1e-06  data/train.samp result/ldf.mod。训练样本保存在data/train.samp文件,模型训练迭代的最大次数是50,最小损失值是1e-06,模型文件存储在result/ldf.mod文件。

      分类预测:> lr_classify data/test.samp data/ldf.mod result/ldf.out。测试样本存放在data/test.samp文件,样本输出的结果文件存在在result/ldf.out文件。

     (ps:这里是logisitc regerssion的工具包,包括了说明了文档,源代码和vs2010编译的可执行程序)

  • 相关阅读:
    1:4 UI标签和通用标签
    1:3访问 servlet API 的两种方式(request,session等内置对象)
    1 :2 Strust2—Demo
    1:1 Struts2概述
    mysql索引原理与慢查询优化1
    mysql流程控制
    mysql函数
    mysql存储过程
    mysql事务
    mysql触发器
  • 原文地址:https://www.cnblogs.com/wangbogong/p/3131838.html
Copyright © 2011-2022 走看看