```python
def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)              #convert to NumPy matrix
    labelMat = mat(classLabels).transpose()  #convert to NumPy matrix
    m,n = shape(dataMatrix)
    alpha = 0.001
    maxCycles = 500
    weights = ones((n,1))
    for k in range(maxCycles):               #heavy on matrix operations
        h = sigmoid(dataMatrix*weights)      #matrix mult
        error = (labelMat - h)               #vector subtraction
        weights = weights + alpha * dataMatrix.transpose() * error  #matrix mult
    return weights
```
This is the gradient ascent code from the book, but I got stuck on the last two lines of the loop: the book says it skips over "a simple mathematical derivation" there. After thinking about it for a while without success, I went looking for an explanation online. The one below is the best I found, and on closer inspection it turns out to be the same derivation covered in Andrew Ng's machine learning videos, which I had forgotten.
The sigmoid function: $g(z)=\frac{1}{1+e^{-z}}$
$h_{\theta}(x)=g(\theta^{T}x)$
Here $\theta$ plays the same role as $w$ in the book: it is the coefficient (weight) vector.
The cost function below measures the error between the predicted class and the actual class:
$cost(h_{\theta}(x),y)=\left\{\begin{matrix}
-\log(h_{\theta}(x)), & y=1\\
-\log(1-h_{\theta}(x)), & y=0
\end{matrix}\right.$
Because $y$ is always either 0 or 1, one of the two terms vanishes, so both cases can be written as a single expression:
$cost(h_{\theta}(x),y)=-y\log(h_{\theta}(x))-(1-y)\log(1-h_{\theta}(x))$
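As a quick sanity check that the single expression matches the piecewise definition (the helper `cost` below is just for illustration, it is not part of the book's code):

```python
import numpy as np

def cost(h, y):
    # single-expression cross-entropy cost for one sample
    return -y * np.log(h) - (1 - y) * np.log(1 - h)

print(cost(0.9, 1))  # -log(0.9) ~= 0.105: a confident, correct prediction is cheap
print(cost(0.9, 0))  # -log(0.1) ~= 2.303: a confident, wrong prediction is expensive
```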
The overall cost function averages this over all $m$ training samples:
$J(\theta)=\frac{1}{m}\sum_{i=1}^{m}cost(h_{\theta}(x^{(i)}),y^{(i)})=\frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log(h_{\theta}(x^{(i)}))-(1-y^{(i)})\log(1-h_{\theta}(x^{(i)}))\right]$
To minimize the error we need to minimize $J(\theta)$, which is equivalent to maximizing $-J(\theta)$, and a maximum can be found with gradient ascent:
$\theta_{j} := \theta_{j}+ \alpha \frac{\partial (-J(\theta))}{\partial \theta_{j}}$
Here is the derivation the book omits; the step that is easy to miss boils down to the sigmoid derivative identity $g'(z)=g(z)(1-g(z))$.
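For a single sample $(x, y)$, write $h = h_{\theta}(x) = g(\theta^{T}x)$. Differentiating the negated per-sample cost (i.e. the per-sample log-likelihood):

$$\begin{aligned}
\frac{\partial}{\partial\theta_{j}}\Bigl[\,y\log h+(1-y)\log(1-h)\,\Bigr]
&=\frac{y}{h}\frac{\partial h}{\partial\theta_{j}}-\frac{1-y}{1-h}\frac{\partial h}{\partial\theta_{j}}\\
&=\Bigl(\frac{y}{h}-\frac{1-y}{1-h}\Bigr)\frac{\partial h}{\partial\theta_{j}}\\
&=\Bigl(\frac{y}{h}-\frac{1-y}{1-h}\Bigr)h(1-h)\,x_{j}\\
&=y(1-h)\,x_{j}-(1-y)h\,x_{j}\\
&=(y-h)\,x_{j}
\end{aligned}$$

The move from the second line to the third is the step that had me stuck: by the chain rule, $\frac{\partial h}{\partial\theta_{j}}=g'(\theta^{T}x)\,x_{j}=h(1-h)\,x_{j}$, using $g'(z)=g(z)(1-g(z))$. Summing over all $m$ samples then gives the gradient used in the update rule below.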
Summing over all samples (and absorbing the constant factor $\frac{1}{m}$ into $\alpha$), the iterative update rule for the weights is:
$\theta_{j} := \theta_{j}+ \alpha \sum_{i=1}^{m}\left(y^{(i)}-h_{\theta}(x^{(i)})\right)x_{j}^{(i)}$
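In matrix form this is exactly the pair of lines that puzzled me at the start: with $X$ the $m \times n$ data matrix, $h = g(X\theta)$ and error $E = y - h$, the whole sum collapses to $\theta := \theta + \alpha X^{T}E$, which is literally `weights = weights + alpha * dataMatrix.transpose() * error`. A minimal numerical check on made-up toy data (all names here are mine, not the book's):

```python
import numpy as np

np.random.seed(0)
X = np.random.randn(5, 3)              # 5 samples, 3 features
y = np.array([0., 1., 1., 0., 1.])
theta = np.ones(3)
h = 1.0 / (1 + np.exp(-X.dot(theta)))  # sigmoid predictions, shape (5,)

# per-sample sum, exactly as written in the update formula
grad_loop = sum((y[i] - h[i]) * X[i] for i in range(5))
# vectorized form used in gradAscent
grad_vec = X.T.dot(y - h)

print(np.allclose(grad_loop, grad_vec))  # True
```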
The code below contains three versions of the algorithm:

- `gradAscent` is the plain batch gradient ascent: every single coefficient update runs over the entire data set, which is computationally expensive.
- `stocGradAscent0` is stochastic gradient ascent: each update uses only one sample point.
- `stocGradAscent1` is the improved stochastic gradient ascent: (1) the number of passes over the data is configurable; (2) sample points are picked in random order within each pass, which reduces the periodic fluctuations in the coefficients; (3) `alpha` is adjusted on every iteration: it shrinks as training progresses but never reaches 0 thanks to the constant term, so early updates move the weights a lot while later ones only fine-tune them, which damps the high-frequency oscillations (see the sketch after this list).
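A quick look at how the schedule `alpha = 4 / (1.0 + j + i) + 0.01` from `stocGradAscent1` actually behaves (`j` is the pass number, `i` the step within a pass):

```python
for j in [0, 1, 10, 100]:
    for i in [0, 50]:
        alpha = 4 / (1.0 + j + i) + 0.01
        print(f"j={j:3d} i={i:2d}  alpha={alpha:.4f}")
# alpha starts at 4.01 and decays toward the 0.01 floor:
# early updates move the weights a lot, later ones barely nudge them
```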
The complete program:
```python
import random
import numpy as np
import matplotlib.pyplot as plt

def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # x0, x1, x2
        labelMat.append(int(lineArr[2]))                             # label
    return dataMat, labelMat

def sigmoid(inX):
    return 1.0 / (1 + np.exp(-inX))

def gradAscent(dataMatIn, classLabels):
    dataMatrix = np.mat(dataMatIn)              # 100 * 3, array converted to matrix
    labelMat = np.mat(classLabels).transpose()  # 100 * 1
    m, n = np.shape(dataMatrix)                 # 100, 3
    alpha = 0.001
    maxCycles = 500
    weights = np.ones((n, 1))                   # 3 * 1
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)       # 100 * 1
        error = labelMat - h
        weights = weights + alpha * dataMatrix.transpose() * error  # 3 * 1
    return weights

"""Plot the decision boundary"""
def plotBestFit(weights):
    dataMat, labelMat = loadDataSet()
    dataArr = np.array(dataMat)                 # convert to ndarray
    n = np.shape(dataArr)[0]                    # number of samples, 100
    xcord1 = []; ycord1 = []                    # x, y coordinates of class-1 samples
    xcord2 = []; ycord2 = []                    # x, y coordinates of class-0 samples
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')  # s is the point size; marker 's' is a square (default is a circle)
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)               # x range of the boundary line
    y = (-weights[0] - weights[1] * x) / weights[2]  # line where w0 + w1*x + w2*y = 0
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()

"""Plot how the coefficients change over the iterations"""
def plotWeight(weight0, weight1, weight2):
    x = np.arange(len(weight0))
    fig = plt.figure()
    ax = fig.add_subplot(311)                   # equivalent to ax = plt.subplot(311)
    ax.plot(x, weight0)
    ax = fig.add_subplot(312)
    ax.plot(x, weight1)
    ax = fig.add_subplot(313)
    ax.plot(x, weight2)
    plt.show()

"""Stochastic gradient ascent"""
def stocGradAscent0(dataArr, classLabels, numIter):  # the book's parameter names are misleading, renamed here
    m, n = np.shape(dataArr)
    alpha = 0.01
    weights = np.ones(n)
    w0 = []; w1 = []; w2 = []
    for j in range(numIter):
        for i in range(m):
            h = sigmoid(sum(dataArr[i] * weights))   # a scalar
            error = classLabels[i] - h               # a scalar
            weights = weights + alpha * error * dataArr[i]
            w0.append(weights[0])
            w1.append(weights[1])
            w2.append(weights[2])
    return weights, w0, w1, w2

"""Improved stochastic gradient ascent"""
def stocGradAscent1(dataArr, classLabels, numIter=150):
    m, n = np.shape(dataArr)
    weights = np.ones(n)
    w0 = []; w1 = []; w2 = []
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01
            randIndex = int(random.uniform(0, len(dataIndex)))  # pick samples in random order to reduce the periodic fluctuations
            h = sigmoid(sum(dataArr[dataIndex[randIndex]] * weights))
            error = classLabels[dataIndex[randIndex]] - h        # a scalar
            weights = weights + alpha * error * dataArr[dataIndex[randIndex]]
            del(dataIndex[randIndex])                            # each sample is used once per pass
            w0.append(weights[0])
            w1.append(weights[1])
            w2.append(weights[2])
    return weights, w0, w1, w2

dataMat, labelMat = loadDataSet()
weights = gradAscent(dataMat, labelMat)
plotBestFit(weights.getA())                     # getA() converts numpy.matrix to ndarray

dataList, labelList = loadDataSet()
weights, w0, w1, w2 = stocGradAscent0(np.array(dataList), labelList, 200)
plotBestFit(weights)
plotWeight(w0, w1, w2)

dataList, labelList = loadDataSet()
weights, w0, w1, w2 = stocGradAscent1(np.array(dataList), labelList)
plotBestFit(weights)
plotWeight(w0, w1, w2)

"""Classification function"""
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0

def colicTest():
    frTrain = open("horseColicTraining.txt")
    frTest = open("horseColicTest.txt")
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(len(currLine) - 1):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[len(currLine) - 1]))
    trainWeights, _, _, _ = stocGradAscent1(np.array(trainingSet), trainingLabels, 500)
    errorCount = 0; numTestVec = 0
    for line in frTest.readlines():
        numTestVec += 1
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(len(currLine) - 1):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[len(currLine) - 1]):
            errorCount += 1
    errorRate = float(errorCount) / float(numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate

"""Average error rate over 10 runs"""
def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))

multiTest()
```
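One practical note: on the horse colic data, `sum(dataArr[i] * weights)` can become a large negative number, and `np.exp(-inX)` then overflows with a RuntimeWarning. If that happens, a numerically stable sigmoid is a common workaround (my own tweak, not from the book):

```python
import numpy as np

def sigmoid(inX):
    # clip the exponent so np.exp never overflows; the output only
    # changes in the already-saturated tails, far beyond float precision
    return 1.0 / (1 + np.exp(-np.clip(inX, -500, 500)))
```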
Gradient ascent, decision boundary after 500 iterations.
Stochastic gradient ascent, decision boundary after 200 iterations.
Stochastic gradient ascent, coefficient traces over 200 iterations.
The book's version of this plot looks different; I'm not sure why mine doesn't match.
Coefficient traces for the improved stochastic gradient ascent: they converge much faster.
Decision boundary from the improved stochastic gradient ascent.