zoukankan      html  css  js  c++  java
  • Logistic回归python实现

    2017-08-12

    Logistic 回归,作为分类器:

    分别用了梯度上升,牛顿法来最优化损失函数:

      1 # -*- coding: utf-8 -*-
      2 
      3 '''
      4 function: 实现Logistic回归,拟合直线,对数据进行分类;
      5             利用梯度上升,随机梯度上升,改进的随机梯度上升,牛顿法分别对损失函数优化;
      6             这里没有给出最后测试分类的函数;
      7 date: 2017.8.12
      8 '''
      9 
     10 from numpy import *
     11 
     12 #从文件加载处理数据
     13 def loadDataSet():
     14     dataMat = []
     15     labelMat = []
     16     fr = open('testSet.txt')
     17     for line in fr.readlines():
     18         lineArr = line.strip().split()
     19         dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
     20         labelMat.append(int(lineArr[2]))
     21     return dataMat, labelMat
     22 
     23 #sigmoid function X: w1*x1+w2*x2+...+wn*xn
     24 def sigmoid(x):
     25     return 1 / (1 + exp(-x))
     26 
     27 #梯度上升求函数的最大值时取得的权重
     28 def gradAscent(dataMatIn, classLabels):
     29     dataMatrix = mat(dataMatIn)
     30     labelMat = mat(classLabels).transpose()   #将m维行向量转制为m维列向量
     31     m,n = shape(dataMatrix)
     32     alpha = 0.001 #设置梯度上升的步长
     33     maxCycles = 500 #最大迭代次数 
     34     weights = ones((n,1))  #weights就是theta,n维列向量,二维数组
     35     for i in range(maxCycles):
     36         h = sigmoid(dataMatrix*weights)  #计算所有数据的分类概率,h是m维向量,这里实际上进行了300次乘法运算
     37         error = labelMat - h #计算(y-h(x)):误差
     38         weights = weights + alpha * dataMatrix.transpose()*error #对所有weights同时更新
     39     print('shape of weights', shape(weights))
     40     return weights
     41 
     42 #随机上升上升
     43 def stocGradAscent0(dataMatrix, classLabels):
     44     m,n = shape(dataMatrix)
     45     alpha = 0.01
     46     weights = ones(n, float) #n 维行向量
     47     for i in range(m):
     48         h = sigmoid(sum(dataMatrix[i] * weights))
     49         error = classLabels[i] - h
     50         weights = weights + alpha * error * dataMatrix[int(i)]
     51     return weights
     52 
     53 #改进的随机上升上升
     54 def stocGradAscent1(dataMatrix, classLabels, numIter = 550):
     55     m,n = shape(dataMatrix)
     56     weights = ones(n)
     57     dataIndex = []
     58     for j in range(numIter):
     59         for k in range(m):
     60             dataIndex.append(k)
     61         for i in range(m):
     62             alpha = 4 / (i + j + 1.0) + 0.01 #aloha随着迭代次数增多不断减小但是不为0,为了解决局部波动
     63             randIndex = int(random.uniform(0, len(dataIndex))) #随机选取一个数据进行更新参数
     64             h = sigmoid(dataMatrix[randIndex] * weights)
     65             error = classLabels[randIndex] - h
     66             weights = weights + alpha * error * dataMatrix[randIndex]
     67             del(dataIndex[randIndex]) #这里会不会对本来的dataMatrix有影响,不会,因为没有操作矩阵
     68     return weights
     69 
     70 '''
     71 functuion: 牛顿法更新theta,计算海森矩阵
     72 note: 这里一定要对 XT*X 结果求逆,不然分类效果无法直视
     73 '''
     74 def computeHessianMatrix(dataMatrix):
     75     hessianMatrix = mat(dataMatrix).transpose().dot(mat(dataMatrix))
     76     return hessianMatrix.I #矩阵求逆
     77 
     78 '''
     79 function: 牛顿法更新theta
     80 '''
     81 def newtonMethod(dataMat, labelMat, numIter = 10):
     82     m,n = shape(dataMat)
     83     dataMatrix = mat(dataMat)
     84     labelMat = mat(labelMat).transpose()
     85     weights = ones((n,1)) #参数向量
     86     hessianMatrix = computeHessianMatrix(dataMatrix)
     87     print('shape of hessian', shape(hessianMatrix))
     88     for k in range(numIter):
     89         h = sigmoid(dataMatrix * weights)
     90         error = (labelMat - h)
     91         weights = weights - (dataMatrix*hessianMatrix).transpose() * error 
     92     return weights
     93 
     94 #画出决策边界
     95 def plotBestFit(weights):
     96     import matplotlib.pyplot as plt
     97     dataMat, labelMat = loadDataSet() #或取数据和标签
     98     dataArr = array(dataMat) #将二维列表转换为数组
     99     n = shape(dataArr)[0] #行数,代表数据的个数
    100 
    101     xcord1 = []; ycord1 = []
    102     xcord2 = []; ycord2 = []
    103     for i in range(n):
    104         if int(labelMat[i]) == 1:
    105             xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
    106         else:
    107             xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
    108 
    109     fig = plt.figure()
    110     ax = fig.add_subplot(111)
    111     ax.scatter(xcord1, ycord1, s = 30, c = 'red', marker = 's')
    112     ax.scatter(xcord2, ycord2, s = 30, c = 'green')
    113     x = arange(-3.0, 3.0, 0.1)
    114     print(len(x))
    115     print(len(weights))
    116     print(shape(weights))
    117     y = (-weights[0] -  weights[1]*x) / weights[2] #把x2堪称y,x0=1,所以就是x1,x2之间的函数
    118     print(len(y))
    119     ax.plot(x,y) 
    120     plt.xlabel('X1'); plt.ylabel('X2')
    121     plt.show() #显示图像
    122 
    123 '''
    124 function: 对测试数据进行测试
    125 input: testDate 是一个向量,或者多个向量
    126 '''
    127 def logisticTest(weights, testData):
    128     m,n = shape(testData)
    129     typeLabel = []
    130     for i in range(m): 
    131         result = sigmoid(sum(testData[i] * weights)) #得到一个册数数据的概率
    132         if result > 0.5:
    133             typeLabel.append(1)
    134         else:
    135             typeLabel.append(0)
    136     return typeLabel
    137 
    138 '''
    139 function: 计算分类的正确率
    140 input: calssLables 是测试数据的类别向量
    141 '''
    142 def getCorrectRate(classLabels, testLabel):
    143     correctRate = 0.0
    144     numOfRight = 0
    145     for i in range(len(testLabel)):
    146         if classLabels[i] == testLabel[i]:
    147             numOfRight += 1
    148     correctRate = numOfRight / float(len(testLabel))
    149     return correctRate
    150 
    151 #测试算法
    152 dataMat, labelMat = loadDataSet()
    153 print('shape of datamet, ', shape(dataMat))
    154 weights1 = gradAscent(dataMat, labelMat)
    155 weights2 = stocGradAscent0(array(dataMat), labelMat)
    156 weights3 = stocGradAscent1(dataMat, labelMat)
    157 
    158 weights4 = newtonMethod(dataMat, labelMat, 2000) #牛顿法
    159 
    160 
    161 #测试正确率
    162 typeLabel = logisticTest(weights4, [[1,1,0],[1,1,1]])
    163 print(typeLabel)
    164 correctRate = getCorrectRate([1,0],typeLabel)
    165 print('正确率: ' ,str(correctRate))
    166 
    167 #画图
    168 #plotBestFit(array(weights1)) #这里因为普通梯度上升返回的是一个矩阵,所以要转换为向量
    169 #plotBestFit(weights2)
    170 #plotBestFit(weights3)
    171 plotBestFit(array(weights4))

    # -*- coding: utf-8 -*-
    '''function: 实现Logistic回归,拟合直线,对数据进行分类;利用梯度上升,随机梯度上升,改进的随机梯度上升,牛顿法分别对损失函数优化;这里没有给出最后测试分类的函数;date: 2017.8.12'''
    from numpy import *
    #从文件加载处理数据def loadDataSet():dataMat = []labelMat = []fr = open('testSet.txt')for line in fr.readlines():lineArr = line.strip().split()dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])labelMat.append(int(lineArr[2]))return dataMat, labelMat
    #sigmoid function X: w1*x1+w2*x2+...+wn*xndef sigmoid(x):return 1 / (1 + exp(-x))
    #梯度上升求函数的最大值时取得的权重def gradAscent(dataMatIn, classLabels):dataMatrix = mat(dataMatIn)labelMat = mat(classLabels).transpose()   #将m维行向量转制为m维列向量m,n = shape(dataMatrix)alpha = 0.001 #设置梯度上升的步长maxCycles = 500 #最大迭代次数 weights = ones((n,1))  #weights就是theta,n维列向量,二维数组for i in range(maxCycles):h = sigmoid(dataMatrix*weights)  #计算所有数据的分类概率,h是m维向量,这里实际上进行了300次乘法运算error = labelMat - h #计算(y-h(x)):误差weights = weights + alpha * dataMatrix.transpose()*error #对所有weights同时更新print('shape of weights', shape(weights))return weights
    #随机上升上升def stocGradAscent0(dataMatrix, classLabels):m,n = shape(dataMatrix)alpha = 0.01weights = ones(n, float) #n 维行向量for i in range(m):h = sigmoid(sum(dataMatrix[i] * weights))error = classLabels[i] - hweights = weights + alpha * error * dataMatrix[int(i)]return weights
    #改进的随机上升上升def stocGradAscent1(dataMatrix, classLabels, numIter = 550):m,n = shape(dataMatrix)weights = ones(n)dataIndex = []for j in range(numIter):for k in range(m):dataIndex.append(k)for i in range(m):alpha = 4 / (i + j + 1.0) + 0.01 #aloha随着迭代次数增多不断减小但是不为0,为了解决局部波动randIndex = int(random.uniform(0, len(dataIndex))) #随机选取一个数据进行更新参数h = sigmoid(dataMatrix[randIndex] * weights)error = classLabels[randIndex] - hweights = weights + alpha * error * dataMatrix[randIndex]del(dataIndex[randIndex]) #这里会不会对本来的dataMatrix有影响,不会,因为没有操作矩阵return weights
    '''functuion: 牛顿法更新theta,计算海森矩阵note: 这里一定要对 XT*X 结果求逆,不然分类效果无法直视'''def computeHessianMatrix(dataMatrix):hessianMatrix = mat(dataMatrix).transpose().dot(mat(dataMatrix))return hessianMatrix.I #矩阵求逆
    '''function: 牛顿法更新theta'''def newtonMethod(dataMat, labelMat, numIter = 10):m,n = shape(dataMat)dataMatrix = mat(dataMat)labelMat = mat(labelMat).transpose()weights = ones((n,1)) #参数向量hessianMatrix = computeHessianMatrix(dataMatrix)print('shape of hessian', shape(hessianMatrix))for k in range(numIter):h = sigmoid(dataMatrix * weights)error = (labelMat - h)weights = weights - (dataMatrix*hessianMatrix).transpose() * error return weights
    #画出决策边界def plotBestFit(weights):import matplotlib.pyplot as pltdataMat, labelMat = loadDataSet() #或取数据和标签dataArr = array(dataMat) #将二维列表转换为数组n = shape(dataArr)[0] #行数,代表数据的个数
    xcord1 = []; ycord1 = []xcord2 = []; ycord2 = []for i in range(n):if int(labelMat[i]) == 1:xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])else:xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
    fig = plt.figure()ax = fig.add_subplot(111)ax.scatter(xcord1, ycord1, s = 30, c = 'red', marker = 's')ax.scatter(xcord2, ycord2, s = 30, c = 'green')x = arange(-3.0, 3.0, 0.1)print(len(x))print(len(weights))print(shape(weights))y = (-weights[0] -  weights[1]*x) / weights[2] #把x2堪称y,x0=1,所以就是x1,x2之间的函数print(len(y))ax.plot(x,y) plt.xlabel('X1'); plt.ylabel('X2')plt.show() #显示图像
    '''function: 对测试数据进行测试input: testDate 是一个向量,或者多个向量'''def logisticTest(weights, testData):m,n = shape(testData)typeLabel = []for i in range(m): result = sigmoid(sum(testData[i] * weights)) #得到一个册数数据的概率if result > 0.5:typeLabel.append(1)else:typeLabel.append(0)return typeLabel
    '''function: 计算分类的正确率input: calssLables 是测试数据的类别向量'''def getCorrectRate(classLabels, testLabel):correctRate = 0.0numOfRight = 0for i in range(len(testLabel)):if classLabels[i] == testLabel[i]:numOfRight += 1correctRate = numOfRight / float(len(testLabel))return correctRate
    #测试算法dataMat, labelMat = loadDataSet()print('shape of datamet, ', shape(dataMat))weights1 = gradAscent(dataMat, labelMat)weights2 = stocGradAscent0(array(dataMat), labelMat)weights3 = stocGradAscent1(dataMat, labelMat)
    weights4 = newtonMethod(dataMat, labelMat, 2000) #牛顿法

    #测试正确率typeLabel = logisticTest(weights4, [[1,1,0],[1,1,1]])print(typeLabel)correctRate = getCorrectRate([1,0],typeLabel)print('正确率: ' ,str(correctRate))
    #画图#plotBestFit(array(weights1)) #这里因为普通梯度上升返回的是一个矩阵,所以要转换为向量#plotBestFit(weights2)#plotBestFit(weights3)plotBestFit(array(weights4))

  • 相关阅读:
    数据加密标准(DES)详解(附源码)
    《ProGit》阅读笔记
    版本管理——Git和SVN的介绍及其优缺点
    jQuery-动画
    ajax与jsonp中的几个封装函数
    关于ajax
    js基础——数组的概念及其方法
    js实现放烟花效果——点击处会从下向上升起烟花
    js实现——鼠标移动时跟随着一连的小图片
    BFC的作用及其应用
  • 原文地址:https://www.cnblogs.com/robin2ML/p/7351712.html
Copyright © 2011-2022 走看看