zoukankan      html  css  js  c++  java
  • 机器学习(1):线性回归和逻辑回归

    线性回归,逻辑回归

    纪要:

    逻辑回归实现代码(C++):

    /*********
    
    logistic回归(c++) by 姜富春
    
    **********/
    
    #include<cmath>
    #include<cstdlib>
    #include<fstream>
    #include<iostream>
    #include<sstream>
    #include<string>
    #include<vector>
    
    using namespace std;
    
    /**
     * One labelled sample: its integer feature values plus its class label.
     */
    struct Data{
        std::vector<int> features;  // feature values of the sample
        int cls;                    // class label of the sample

        // Stores a copy of the feature vector f together with the label c.
        Data(std::vector<int> f, int c)
            : features(f),
              cls(c)
        {}
    };
    
    /**
     * Model parameters: one weight per feature plus the bias term that is
     * added to the weighted feature sum.
     */
    struct Param{
        std::vector<double> w;  // weights, one per feature
        double d;               // bias term

        // Empty weight vector and a bias of 0.
        Param() : w(), d(0.0) {}

        // Takes the weights w1 and the bias d1 as given.
        Param(std::vector<double> w1, double d1) : w(w1), d(d1) {}
    };
    
    class Logistic{
    
    public:
    
        Logistic(){
    
            //载入traindata文件构造dataSet;
    
            loadDataSet(dataSet);
    
            //初始化Param,w的长度与数据特征的长度相同,初值为0.0。d的初值也为0.0
    
            vector<double> pw(dataSet[0].features.size(), 0.0);
    
            Param pt(pw, 0.0);
    
            param = pt;
    
    
    
        };
    
        void loadDataSet(vector<Data>& ds, string dataFile = "./traindata.txt"){
    
            ifstream fin(dataFile.c_str());
    
            if (!fin){
    
                cout << "文件打开失败" << endl;
    
                exit(0);
    
            }
    
            while (fin){
    
                string line;
    
                getline(fin, line);
    
                if (line.size()>3){
    
                    stringstream sin(line);
    
                    int t;
    
                    sin >> t;
    
                    vector<int> fea;
    
                    while (sin){
    
                        char c = sin.peek();
    
                        if (int(c) != -1){
    
                            sin >> t;
    
                            fea.push_back(t);
    
                        }
    
    
    
                    }
    
                    int cl = fea.back();
    
                    fea.pop_back();
    
                    ds.push_back(Data(fea, cl));
    
                }
    
            }
    
        }
    
    
    
        void displayDataSet(){
    
            for (int i = 0; i<dataSet.size(); i++){
    
                for (int j = 0; j<dataSet[i].features.size(); j++){
    
                    cout << dataSet[i].features[j] << " ";
    
                }
    
                cout << " 分类:" << dataSet[i].cls;
    
                cout << endl;
    
            }
    
        }
    
        void logisticRegression(){
    
            //由目标函数为最大似然,因此最终求得的是目标函数的最大值,
    
            //因此迭代过程是梯度上升,而非梯度下降
    
            double lamda = 0.1;//梯度下降的步长
    
            double delta = 0.0001;//结束迭代的阈值
    
            //目标函数的值
    
            double objLw = Lw(param);
    
            //cout<<objLw<<endl;
    
            Param tpa(param.w, param.d);
    
            gradient(lamda);
    
            double newObjLw = Lw(param);
    
            int iter = 0;
    
            cout << "初始:" << endl;
    
            displayIterProcess(iter, objLw, newObjLw, 1);
    
            while (fabs(newObjLw - objLw)>delta || !samewb(tpa, param, delta)){
    
                objLw = newObjLw;
    
                tpa = Param(param.w, param.d);
    
                gradient(lamda);
    
                newObjLw = Lw(param);
    
                ++iter;
    
                displayIterProcess(iter, objLw, newObjLw, 5);
    
            }
    
            cout << "迭代结束共迭代" << iter << "" << endl;
    
            displayIterProcess(iter, objLw, newObjLw, 1);
    
    
    
        }
    
        bool samewb(const Param &tparam, const Param& param, double delta){
    
            for (int i = 0; i<tparam.w.size(); i++){
    
                if (fabs(tparam.w[i] - param.w[i])>delta){
    
                    return false;
    
                }
    
            }
    
            if (fabs(tparam.d - param.d)>delta){
    
                return false;
    
            }
    
            return true;
    
        }
    
        void displayIterProcess(int iter, double objLw, double newObjLw, int mod){
    
            //每mod步打印一次迭代过程
    
            if (iter%mod == 0){
    
                cout << "迭代" << iter << ":目标函数值【" << newObjLw << "】,两次迭代目标函数差值【 " << (newObjLw - objLw) << "" << endl;
    
                cout << "模型参数:";
    
                for (int i = 0; i<param.w.size(); i++){
    
                    cout << param.w[i] << " ";
    
                }
    
                cout << param.d << endl << endl;
    
            }
    
    
    
        }
    
        //梯度上升更新w和b
    
        void gradient(double lam){
    
            for (int i = 0; i<param.w.size(); i++){
    
                double tmp = 0.0L;//保存梯度上升过程的中间值
    
                for (int j = 0; j<dataSet.size(); j++){
    
                    tmp += (dataSet[j].cls - logiFun(param, dataSet[j]))*dataSet[j].features[i] * lam;
    
                }
    
                param.w[i] += (tmp);
    
            }
    
            double tmp = 0.0L;
    
            for (int j = 0; j<dataSet.size(); j++){
    
                tmp += (dataSet[j].cls - logiFun(param, dataSet[j]))*lam;
    
            }
    
            param.d += tmp;
    
    
    
        }
    
        //计算logistic函数的值,即f(x)=exp(wx)/(1+exp(wx)),该表达式在求解梯度过程中出现,
    
        //因此计算这个值是为了辅助梯度上升计算过程
    
        inline double logiFun(const Param &p, const Data &d){
    
            double inner = innerWX(p, d);
    
            double le = exp(inner) / (1 + exp(inner));
    
            return le;
    
        }
    
        //计算对数似然函数的值
    
        double Lw(Param p){
    
            double l = 0.0L;
    
            for (int i = 0; i<dataSet.size(); i++){
    
                double inner = innerWX(p, dataSet[i]);
    
                l += (dataSet[i].cls*inner - (log10(1 + exp(inner))));
    
                //cout<<"l="<<l<<endl;
    
            }
    
    
    
            return l;
    
        }
    
        //计算wx+b的值
    
        inline double innerWX(const Param &p, const Data &data){
    
            if (p.w.size() != data.features.size()){
    
                cout << "参数与实例的维度不匹配,不能进行内积计算" << endl;
    
                exit(0);
    
            }
    
            double innerP = 0.0L;
    
            for (int i = 0; i<p.w.size(); i++){
    
                innerP += (p.w[i] * data.features[i]);
    
            }
    
            innerP += p.d;
    
            return innerP;
    
        }
    
        //给定测试集,预测分类
    
        void predictClass(){
    
            vector<Data> testDataSet;
    
            loadDataSet(testDataSet, "./testdata.txt");
    
            /*******************
    
            分别计算
    
            P(Y=1|x)=exp(w.x)/(1+exp(w.x))
    
            和
    
            P(Y=0|x)=1/(1+exp(w.x))
    
            然后取值大的作为x的分类
    
            *******************/
    
            cout << endl << "预测分类:" << endl;
    
            for (int i = 0; i<testDataSet.size(); i++){
    
                double py1 = 0.0L;
    
                double py0 = 0.0L;
    
                double inner = innerWX(param, testDataSet[i]);
    
                py1 = exp(inner) / (1 + exp(inner));
    
                py0 = 1 - py1;
    
                cout << "实例: ";
    
                for (int j = 0; j<testDataSet[i].features.size(); j++){
    
                    cout << testDataSet[i].features[j] << " ";
    
                }
    
                cout << "标记分类【" << testDataSet[i].cls << "】,";
    
                if (py1 >= py0){
    
    
    
                    cout << "预测分类【" << 1 << "" << endl;
    
                }
                else{
    
                    cout << "预测分类【" << 0 << "" << endl;
    
                }
    
            }
    
        }
    
    private:
    
        vector<Data> dataSet;
    
        Param param;
    
    };
    
    /**
     * Program entry point: builds the classifier (which loads the training
     * data), fits it and prints the predictions for the test data.
     */
    int main(){
        Logistic logist;
        //logist.displayDataSet();
        logist.logisticRegression();
        logist.predictClass();
    #ifdef _WIN32
        // "pause" is a Windows shell builtin that keeps the console window
        // open; on other systems the command does not exist.
        system("pause");
    #endif
        return 0;
    }

     其中,testdata.txt 保存测试数据;

             traindata.txt 保存训练数据;

             logistic.cpp 是代码源文件。三个文件保存在同一目录下。

    数据的格式如下:

       10009 1 0 0 1 0 1  

       10025  0 0 1 2 0 0  

       20035  0 0 1 0 0 1  

         20053  1 0 0 0 0 0

    每行有7个列值,第一列是一个ID号,在具体操作中,忽略该列。之后的5列,每一个都表示一个特征的取值;最后一列是分类标记(0或1)。

    只有实践才能深刻理解。(实践出真知)

     参考:

    http://www.cnblogs.com/tornadomeet/p/3395593.html

    http://www.cnblogs.com/jfcspring/p/3512356.html

  • 相关阅读:
    实习应聘总结
    SSH:远程登陆
    对HashMap进行排序
    python笔记-集合类型
    python笔记-序列类型
    python笔记-数字类型
    python笔记-变量与字符串
    python笔记-数据类型
    C#winform调用外部程序,等待外部程序执行完毕才执行下面代码
    防止查询数据返回数据行为零引起的逻辑判断重复或抛出异常
  • 原文地址:https://www.cnblogs.com/Allen-rg/p/6021255.html
Copyright © 2011-2022 走看看