zoukankan      html  css  js  c++  java
  • 《机器学习实战》Logistic回归


    注释:Ng的视频有完整的推导步骤,不过理论和实践还是有很大差别的,代码实现还得自己完成

    1.Logistic回归理论

      http://www.cnblogs.com/wjy-lulu/p/7759515.html,Ng的推导很完美,看懂就可以了,没必要自己推导一遍,因为几天不用就忘记了。

    2.代码实现

      2.1全局梯度上升

        每次训练针对整体,依据整体去找最值。

        好处:容易过滤局部极值,找到真正的全局极值。

        坏处:整体数据太多,花费时间太久,而且新来的样本必须重新训练。

        推导公式:见博文刚开始的链接,Ng大神的全部推导及证明!

     def loadDataSet(fileName='testSet.txt'):
         """Read a whitespace-separated sample file into feature rows and labels.

         Each non-blank line must hold two numeric features followed by a class
         label. A constant 1.0 is prepended to every feature row so that the
         bias term can be learned as an ordinary weight.

         Args:
             fileName: Path of the data file. Defaults to 'testSet.txt', the
                 path that used to be hard-coded.

         Returns:
             A tuple (dataMat, labelMat). dataMat is a list of [1.0, x1, x2]
             rows; labelMat is a list of one-element [label] lists (column
             layout), e.g. [[1], [0], ...].

         Raises:
             OSError: If the file cannot be opened.
             ValueError: If a field cannot be parsed as a number.
             IndexError: If a non-blank line has fewer than three fields.
         """
         dataMat = []
         labelMat = []
         # The context manager closes the file even when a line fails to parse.
         with open(fileName) as fr:
             for line in fr:
                 lineArr = line.split()  # split on any run of whitespace
                 if not lineArr:
                     continue  # tolerate blank lines such as a trailing newline
                 dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
                 # int(float(...)) also accepts labels written as "1.0".
                 labelMat.append([int(float(lineArr[2]))])
         return dataMat, labelMat
    def sigmoid(intX):
        """Return the logistic function 1 / (1 + exp(-intX)), element-wise.

        The value is computed as exp(-log(1 + exp(-intX))) through
        np.logaddexp, so large negative inputs no longer overflow np.exp.
        Results agree with the direct formula up to floating-point rounding.

        Args:
            intX: A scalar or array-like of real numbers.

        Returns:
            The sigmoid of every element, with the shape NumPy gives the
            element-wise result for the input.
        """
        return np.exp(-np.logaddexp(0.0, np.negative(intX)))
    # Batch ("global") gradient ascent
    def gradAscent(dataMatIn, classLabels):
        """Fit logistic-regression weights with batch gradient ascent.

        Every one of the 200 iterations uses all samples at once.

        Args:
            dataMatIn: 2-D array-like of shape (m, n), one sample per row.
                The bias is treated as just another weight, so callers are
                expected to include a constant column themselves.
            classLabels: m labels, either flat ([1, 0, ...]) or in column
                layout ([[1], [0], ...]).

        Returns:
            The fitted weights as an (n, 1) np.matrix.

        Raises:
            ValueError: If the number of labels differs from the number of
                samples.
        """
        dataMatrix = np.asarray(dataMatIn, dtype=float)
        # Force a column vector. A flat label list would otherwise become a
        # row and silently broadcast against the (m, 1) predictions.
        labels = np.asarray(classLabels, dtype=float).reshape(-1, 1)
        m, n = dataMatrix.shape
        if labels.shape[0] != m:
            raise ValueError(
                'expected %d labels, got %d' % (m, labels.shape[0]))
        alpha = 0.001   # step size
        maxCycle = 200  # number of full passes
        weight = np.ones((n, 1))
        for _ in range(maxCycle):
            h = sigmoid(dataMatrix @ weight)
            error = labels - h
            weight = weight + alpha * (dataMatrix.T @ error)
        # np.asmatrix keeps the matrix return type without using np.mat.
        return np.asmatrix(weight)

      2.2简单分类可视化

        利用matplotlib画出简单分类的决策边界

        注意:这里plot转化为list之后绘制的,看网上说可以直接用matrix,但是我运行出错。

     def plotBestFit(weight):
         """Scatter the samples from loadDataSet() and draw the decision line.

         The line is the set of points where w0 + w1*x1 + w2*x2 == 0.

         Args:
             weight: Three weights (bias, x1, x2) in any array-like layout. It
                 is flattened first, so both an (n, 1) matrix and a 1-D array
                 are accepted.

         Raises:
             ValueError: If weight does not contain exactly three values.
         """
         dataMat, labelMat = loadDataSet()
         dataArr = np.asarray(dataMat, dtype=float)
         # Flatten so both [[1], [0], ...] and [1, 0, ...] label layouts work.
         labels = np.asarray(labelMat).ravel().astype(int)
         w = np.asarray(weight, dtype=float).ravel()
         if w.size != 3:
             raise ValueError('expected 3 weights, got %d' % w.size)
         positive = labels == 1
         fig = plt.figure("data_x_y")
         ax = fig.add_subplot(111)
         ax.scatter(dataArr[positive, 1], dataArr[positive, 2],
                    s=30, c='r', marker='s')
         ax.scatter(dataArr[~positive, 1], dataArr[~positive, 2], s=30, c='g')
         if w[2] != 0:
             # Plain ndarrays plot directly; no matrix-to-list conversion.
             x = np.arange(-3.0, 3.0, 0.1)
             y = (-w[0] - w[1] * x) / w[2]
             ax.plot(x, y)
         elif w[1] != 0:
             # With no x2 term the boundary is the vertical line x1 = -w0/w1.
             ax.axvline(-w[0] / w[1])
         plt.xlabel('X1')
         plt.ylabel('X2')
         plt.show()

      2.3局部随机梯度上升法及改进

        局部随机梯度:和全局相对,利用单个样本更新W,同时按均匀分布随机选择样本的次序(代码中使用 np.random.uniform)。

        好处:‘局部’训练效率高,而且新的样本可以直接添加不用重新训练,‘随机’解决了样本规律性的波动,书上有图解。

        坏处:可能得到局部极值。

     # Stochastic ("local") gradient ascent - original version
     def stoGradAscent0(dataMatrix, classLabels):
         """Fit weights with one pass of per-sample gradient ascent.

         Samples are visited once, in order, with a fixed step size of 0.01.

         Args:
             dataMatrix: 2-D array-like of shape (m, n), one sample per row.
             classLabels: m labels, flat or in column layout.

         Returns:
             A 1-D ndarray of n weights.
         """
         samples = np.asarray(dataMatrix, dtype=float)
         labels = np.asarray(classLabels, dtype=float).ravel()
         m, n = samples.shape
         alpha = 0.01
         # Author's note: start from ones, not zeros; zeros reportedly fit slowly.
         weights = np.ones(n)
         for i in range(m):
             h = sigmoid(np.dot(samples[i], weights))
             # Use this sample's own label, not the whole label container.
             error = labels[i] - h
             weights = weights + alpha * error * samples[i]
         return weights
    # Stochastic gradient ascent - improved version
    def stoGradAscent1(dataMatraix, classLabels, numIter=150):
        """Fit weights with randomised per-sample gradient ascent.

        Each pass visits every sample exactly once in a fresh random order.
        The step size shrinks as the pass index j and the position i grow,
        but never drops below 0.01.

        Args:
            dataMatraix: 2-D array-like of shape (m, n), one sample per row.
            classLabels: m labels, flat or in column layout.
            numIter: Number of passes over the data.

        Returns:
            A 1-D ndarray of n weights.
        """
        samples = np.asarray(dataMatraix, dtype=float)
        labels = np.asarray(classLabels, dtype=float).ravel()
        m, n = samples.shape
        weights = np.ones(n)
        for j in range(numIter):
            # A permutation yields true sampling without replacement: the
            # drawn value is the sample index itself, not a position in a
            # shrinking index list.
            for i, sampleIdx in enumerate(np.random.permutation(m)):
                alpha = 4 / (1.0 + j + i) + 0.01
                h = sigmoid(np.dot(samples[sampleIdx], weights))
                error = labels[sampleIdx] - h
                weights = weights + alpha * error * samples[sampleIdx]
        return weights

      2.4实际应用

        和前面朴素贝叶斯都差不多,预处理数据-->>训练-->>测试

     1 #分类函数
     def classifyVector(inX, weight):
         """Classify one sample as 1.0 or 0.0.

         The element-wise product of inX and weight is summed and passed
         through sigmoid(). The result is 1.0 when that probability is
         strictly above 0.5, otherwise 0.0.
         """
         activation = sum(inX * weight)
         return 1.0 if sigmoid(activation) > 0.5 else 0.0
     def _loadColicFile(fileName):
         """Parse a tab-separated file into (feature rows, labels).

         On every non-blank line the last column is the label and all
         preceding columns are features.
         """
         featureRows = []
         labels = []
         # The context manager guarantees the file is closed.
         with open(fileName) as fr:
             for line in fr:
                 currLine = line.strip().split('\t')
                 if len(currLine) < 2:
                     continue  # blank line: nothing to parse
                 featureRows.append([float(value) for value in currLine[:-1]])
                 labels.append(float(currLine[-1]))
         return featureRows, labels


     def colicTest():
         """Train on horseColicTraining.txt and score on horseColicTest.txt.

         Training uses the improved stochastic gradient ascent with 500
         passes. The error rate is printed and returned.

         Returns:
             The fraction of test samples that were misclassified.

         Raises:
             ValueError: If the test file contains no samples.
         """
         trainingSet, trainingLabel = _loadColicFile('horseColicTraining.txt')
         testSet, testLabel = _loadColicFile('horseColicTest.txt')
         if not testSet:
             raise ValueError('horseColicTest.txt contains no samples')
         # Improved stochastic gradient ascent (per-sample updates).
         trainWeight = stoGradAscent1(np.array(trainingSet), trainingLabel, 500)
         errorCount = 0.0
         # The feature count follows the file (all columns but the last),
         # the same rule the training data uses.
         for featureVec, label in zip(testSet, testLabel):
             if int(classifyVector(np.array(featureVec), trainWeight)) != int(label):
                 errorCount += 1
         errorRate = errorCount / float(len(testSet))
         print('the error Rate is : ', errorRate, '\n')
         return errorRate
    def multiTest():
        """Run colicTest() ten times and print the mean error rate."""
        numTest = 10
        # Accumulate from 0.0, in call order, exactly like a running total.
        errorSum = sum((colicTest() for _ in range(numTest)), 0.0)
        print('error Rate Average is : ', (errorSum / numTest))

      2.5总程序

      1 import numpy as np
      2 import matplotlib.pyplot as plt
      3 
      def loadDataSet(fileName='testSet.txt'):
          """Read a whitespace-separated sample file into feature rows and labels.

          Each non-blank line must hold two numeric features followed by a class
          label. A constant 1.0 is prepended to every feature row so that the
          bias term can be learned as an ordinary weight.

          Args:
              fileName: Path of the data file. Defaults to 'testSet.txt', the
                  path that used to be hard-coded.

          Returns:
              A tuple (dataMat, labelMat). dataMat is a list of [1.0, x1, x2]
              rows; labelMat is a list of one-element [label] lists (column
              layout), e.g. [[1], [0], ...].

          Raises:
              OSError: If the file cannot be opened.
              ValueError: If a field cannot be parsed as a number.
              IndexError: If a non-blank line has fewer than three fields.
          """
          dataMat = []
          labelMat = []
          # The context manager closes the file even when a line fails to parse.
          with open(fileName) as fr:
              for line in fr:
                  lineArr = line.split()  # split on any run of whitespace
                  if not lineArr:
                      continue  # tolerate blank lines such as a trailing newline
                  dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
                  # int(float(...)) also accepts labels written as "1.0".
                  labelMat.append([int(float(lineArr[2]))])
          return dataMat, labelMat
     def sigmoid(intX):
         """Return the logistic function 1 / (1 + exp(-intX)), element-wise.

         The value is computed as exp(-log(1 + exp(-intX))) through
         np.logaddexp, so large negative inputs no longer overflow np.exp.
         Results agree with the direct formula up to floating-point rounding.

         Args:
             intX: A scalar or array-like of real numbers.

         Returns:
             The sigmoid of every element, with the shape NumPy gives the
             element-wise result for the input.
         """
         return np.exp(-np.logaddexp(0.0, np.negative(intX)))
     # Batch ("global") gradient ascent
     def gradAscent(dataMatIn, classLabels):
         """Fit logistic-regression weights with batch gradient ascent.

         Every one of the 200 iterations uses all samples at once.

         Args:
             dataMatIn: 2-D array-like of shape (m, n), one sample per row.
                 The bias is treated as just another weight, so callers are
                 expected to include a constant column themselves.
             classLabels: m labels, either flat ([1, 0, ...]) or in column
                 layout ([[1], [0], ...]).

         Returns:
             The fitted weights as an (n, 1) np.matrix.

         Raises:
             ValueError: If the number of labels differs from the number of
                 samples.
         """
         dataMatrix = np.asarray(dataMatIn, dtype=float)
         # Force a column vector. A flat label list would otherwise become a
         # row and silently broadcast against the (m, 1) predictions.
         labels = np.asarray(classLabels, dtype=float).reshape(-1, 1)
         m, n = dataMatrix.shape
         if labels.shape[0] != m:
             raise ValueError(
                 'expected %d labels, got %d' % (m, labels.shape[0]))
         alpha = 0.001   # step size
         maxCycle = 200  # number of full passes
         weight = np.ones((n, 1))
         for _ in range(maxCycle):
             h = sigmoid(dataMatrix @ weight)
             error = labels - h
             weight = weight + alpha * (dataMatrix.T @ error)
         # np.asmatrix keeps the matrix return type without using np.mat.
         return np.asmatrix(weight)
     30 
     def plotBestFit(weight):
         """Scatter the samples from loadDataSet() and draw the decision line.

         The line is the set of points where w0 + w1*x1 + w2*x2 == 0.

         Args:
             weight: Three weights (bias, x1, x2) in any array-like layout. It
                 is flattened first, so both an (n, 1) matrix and a 1-D array
                 are accepted.

         Raises:
             ValueError: If weight does not contain exactly three values.
         """
         dataMat, labelMat = loadDataSet()
         dataArr = np.asarray(dataMat, dtype=float)
         # Flatten so both [[1], [0], ...] and [1, 0, ...] label layouts work.
         labels = np.asarray(labelMat).ravel().astype(int)
         w = np.asarray(weight, dtype=float).ravel()
         if w.size != 3:
             raise ValueError('expected 3 weights, got %d' % w.size)
         positive = labels == 1
         fig = plt.figure("data_x_y")
         ax = fig.add_subplot(111)
         ax.scatter(dataArr[positive, 1], dataArr[positive, 2],
                    s=30, c='r', marker='s')
         ax.scatter(dataArr[~positive, 1], dataArr[~positive, 2], s=30, c='g')
         if w[2] != 0:
             # Plain ndarrays plot directly; no matrix-to-list conversion.
             x = np.arange(-3.0, 3.0, 0.1)
             y = (-w[0] - w[1] * x) / w[2]
             ax.plot(x, y)
         elif w[1] != 0:
             # With no x2 term the boundary is the vertical line x1 = -w0/w1.
             ax.axvline(-w[0] / w[1])
         plt.xlabel('X1')
         plt.ylabel('X2')
         plt.show()
     # Stochastic ("local") gradient ascent - original version
     def stoGradAscent0(dataMatrix, classLabels):
         """Fit weights with one pass of per-sample gradient ascent.

         Samples are visited once, in order, with a fixed step size of 0.01.

         Args:
             dataMatrix: 2-D array-like of shape (m, n), one sample per row.
             classLabels: m labels, flat or in column layout.

         Returns:
             A 1-D ndarray of n weights.
         """
         samples = np.asarray(dataMatrix, dtype=float)
         labels = np.asarray(classLabels, dtype=float).ravel()
         m, n = samples.shape
         alpha = 0.01
         # Author's note: start from ones, not zeros; zeros reportedly fit slowly.
         weights = np.ones(n)
         for i in range(m):
             h = sigmoid(np.dot(samples[i], weights))
             # Use this sample's own label, not the whole label container.
             error = labels[i] - h
             weights = weights + alpha * error * samples[i]
         return weights
     # Stochastic gradient ascent - improved version
     def stoGradAscent1(dataMatraix, classLabels, numIter=150):
         """Fit weights with randomised per-sample gradient ascent.

         Each pass visits every sample exactly once in a fresh random order.
         The step size shrinks as the pass index j and the position i grow,
         but never drops below 0.01.

         Args:
             dataMatraix: 2-D array-like of shape (m, n), one sample per row.
             classLabels: m labels, flat or in column layout.
             numIter: Number of passes over the data.

         Returns:
             A 1-D ndarray of n weights.
         """
         samples = np.asarray(dataMatraix, dtype=float)
         labels = np.asarray(classLabels, dtype=float).ravel()
         m, n = samples.shape
         weights = np.ones(n)
         for j in range(numIter):
             # A permutation yields true sampling without replacement: the
             # drawn value is the sample index itself, not a position in a
             # shrinking index list.
             for i, sampleIdx in enumerate(np.random.permutation(m)):
                 alpha = 4 / (1.0 + j + i) + 0.01
                 h = sigmoid(np.dot(samples[sampleIdx], weights))
                 error = labels[sampleIdx] - h
                 weights = weights + alpha * error * samples[sampleIdx]
         return weights
     # Classification function
     def classifyVector(inX, weight):
         """Classify one sample as 1.0 or 0.0.

         The element-wise product of inX and weight is summed and passed
         through sigmoid(). The result is 1.0 when that probability is
         strictly above 0.5, otherwise 0.0.
         """
         activation = sum(inX * weight)
         return 1.0 if sigmoid(activation) > 0.5 else 0.0
     def _loadColicFile(fileName):
         """Parse a tab-separated file into (feature rows, labels).

         On every non-blank line the last column is the label and all
         preceding columns are features.
         """
         featureRows = []
         labels = []
         # The context manager guarantees the file is closed.
         with open(fileName) as fr:
             for line in fr:
                 currLine = line.strip().split('\t')
                 if len(currLine) < 2:
                     continue  # blank line: nothing to parse
                 featureRows.append([float(value) for value in currLine[:-1]])
                 labels.append(float(currLine[-1]))
         return featureRows, labels


     def colicTest():
         """Train on horseColicTraining.txt and score on horseColicTest.txt.

         Training uses the improved stochastic gradient ascent with 500
         passes. The error rate is printed and returned.

         Returns:
             The fraction of test samples that were misclassified.

         Raises:
             ValueError: If the test file contains no samples.
         """
         trainingSet, trainingLabel = _loadColicFile('horseColicTraining.txt')
         testSet, testLabel = _loadColicFile('horseColicTest.txt')
         if not testSet:
             raise ValueError('horseColicTest.txt contains no samples')
         # Improved stochastic gradient ascent (per-sample updates).
         trainWeight = stoGradAscent1(np.array(trainingSet), trainingLabel, 500)
         errorCount = 0.0
         # The feature count follows the file (all columns but the last),
         # the same rule the training data uses.
         for featureVec, label in zip(testSet, testLabel):
             if int(classifyVector(np.array(featureVec), trainWeight)) != int(label):
                 errorCount += 1
         errorRate = errorCount / float(len(testSet))
         print('the error Rate is : ', errorRate, '\n')
         return errorRate
    def multiTest():
        """Run colicTest() ten times and print the mean error rate."""
        numTest = 10
        # Accumulate from 0.0, in call order, exactly like a running total.
        errorSum = sum((colicTest() for _ in range(numTest)), 0.0)
        print('error Rate Average is : ', (errorSum / numTest))
  • 相关阅读:
    SAP Hybris使用recipe进行安装时,是如何执行ant命令的?
    实时电商数仓(三)之数据采集(二)搭建日志采集系统的集群(二)建立父工程
    实时电商数仓(一)之系统架构
    gdb 条件断点 + 多线程 +attach
    dpdk tx_pkt_burst rte_pktmbuf_free mbuf释放
    dpdk 网卡初始化 —— 收包
    dpdk 版本变动修改
    rte_mempool_get_priv
    mempool + ring test
    dpdk mempool debug
  • 原文地址:https://www.cnblogs.com/wjy-lulu/p/7967885.html
Copyright © 2011-2022 走看看