zoukankan      html  css  js  c++  java
  • 《统计学习方法》第八章,提升方法2

    ▶ 回归问题的提升树:以分段常数函数(只有一个分界点的决策树桩)作为基学习器。每一轮选择一个分界点及其两侧的估值,使训练数据在两侧的残差平方和之和最小;然后把本轮得到的残差作为下一轮的训练目标;最后把各轮得到的分段函数相加,即为最终的回归结果

    ● 代码

     1 import numpy as np
     2 import matplotlib.pyplot as plt
     3 from matplotlib.patches import Rectangle
     4 
     5 trainDataRatio = 0.3                                                            # fraction of the generated samples that test() uses for training
     6 dataSize = 1000                                                                 # default number of samples produced by createData()
     7 defaultEpsilon = 0.05                                                           # minimum interval width that still needs to be split (not referenced in the visible code)
     8 randomSeed = 103                                                                # seed handed to np.random.seed() in createData()
     9 
    10 def myColor(x):                                                                 # 颜色函数
    11     r = np.select([x < 1/2, x < 3/4, x <= 1, True],[0, 4 * x - 2, 1, 0])
    12     g = np.select([x < 1/4, x < 3/4, x <= 1, True],[4 * x, 1, 4 - 4 * x, 0])
    13     b = np.select([x < 1/4, x < 1/2, x <= 1, True],[1, 2 - 4 * x, 0, 0])
    14     return [r**2,g**2,b**2]
    15 
    16 def dataSplit(dataX, dataY, part):                                              # 将数据集分割为训练集和测试集
    17     return dataX[:part],dataY[:part], dataX[part:], dataY[part:]
    18 
    19 def judgeStrong(x, para):                                                       # 强分类判别函数,调用弱分类判别函数进行线性加和
    20     return para[1][targetIndex(x, para[0])]    
    21 
    22 def targetIndex(x, xList):                                                      # 二分查找 xList 中大于 x 的最小索引
    23     lp = 0
    24     rp = len(xList) - 1    
    25     while lp < rp - 1:        
    26         mp = (lp + rp) >> 1
    27         if(xList[mp] >= x):
    28             rp = mp
    29         else:
    30             lp = mp
    31     return rp
    32 
    33 def createData(count = dataSize):                                               # 创建数据
    34     np.random.seed(randomSeed)       
    35     X = np.random.rand(count)    
    36     Y = X * (32 / 3 * (X-1) * (X-1/2) + 1)
    37     return X, Y
    38     
    39 def adaBoost(dataX, dataY, weakCount):                                          # 提升训练
    40     count = len(dataX)
    41     xSort, ySort = zip( *sorted(zip(dataX, dataY)) )
    42     xSort = np.array(xSort)
    43     ySort = np.array(ySort)    
    44 
    45     result = np.zeros([weakCount,3])
    46     for i in range(weakCount):
    47         table = np.zeros(count - 1)
    48         for j in range(count - 1):                                              # 找最佳分割点,损失函数为两侧残差平方和的和
    49             #table[j] = np.sum( (ySort[:j+1] - np.mean(ySort[:j+1]))**2 ) + np.sum( (ySort[j+1:] - np.mean(ySort[j+1:]))**2 )    
    50             table[j] = -np.sum(ySort[:j+1])**2 / (j+1) - np.sum(ySort[j+1:])**2 / (count - j - 1)    # 简化一点计算量        
    51         index = np.argmin(table)                    
    52         
    53         valueLeft = np.mean(ySort[:index+1])                                    # 两侧估值
    54         valueRight = np.mean(ySort[index+1:])                                
    55         result[i] = np.array([ xSort[index], valueLeft, valueRight ])           # 当前结果暂存,表示当前分类器以 xSort[index] 为分点,左右侧取值分别为 valueLeft 和 valueRight
    56         ySort[:index+1] -= valueLeft                                            # 两侧新残差
    57         ySort[index+1:] -= valueRight        
    58     
    59     result = np.array(sorted(result, key=lambda x:x[0]))
    60     #L = np.triu( np.tile(result[:,1], [weakCount+1,1]), 0 )                         # 分段结果加和为最终分类器,注意段数为 weakCount + 1 
    61     #R = np.tril( np.tile(result[:,2], [weakCount+1,1]), -1)                         # 这里第 i 行 等于 r[0] + r[1] + ... + r[i-1] + l[i] + l[i+1] + ... +l[n],表示第 i 个分点
    62     #return np.concatenate(( result[:,0], np.array([np.inf]) )), np.sum(L + R, 1)    # 其中第 0 行是 np.sum(result[:,1]),第 weakCount 行是 np.sum(result[:,2])
    63     L = np.concatenate((                  np.cumsum(result[:,1].T[::-1])[::-1], np.array([0.0]) )) # 简化一点计算量和空间开销
    64     R = np.concatenate(( np.array([0.0]), np.cumsum(result[:,2].T) ))
    65     return np.concatenate(( result[:,0], np.array([np.inf]) )), L + R
    66 
    67 def test(weakCount):                                                            # 单次测试
    68     allX, allY = createData()
    69     trainX, trainY, testX, testY = dataSplit(allX, allY, int(dataSize * trainDataRatio))
    70     
    71     para = adaBoost(trainX, trainY, weakCount)        
    72 
    73     testResult = [ judgeStrong(i, para) for i in testX ]
    74     errorRatio = np.sum( np.abs(np.array(testResult) -  testY) ) / (dataSize*(1-trainDataRatio))
    75     print( "weakCount = %d, errorRatio = %f"%(weakCount, round(errorRatio,4)) )         
    76 
    77     fig = plt.figure(figsize=(10, 8))                                           # 画图
    78     plt.xlim(-0.1, 1.1)
    79     plt.ylim(-0.1, 1.1)
    80     XX = [0.0] + para[0][:-1].flatten().tolist() + [1.0]
    81     YY = [ judgeStrong(i, para) for i in XX ]    
    82     plt.scatter(testX, testY, color = myColor(1), s = 8, label = "originalData")
    83     for i in range(weakCount+1):
    84         plt.plot(XX[i:i+2], [YY[i+1], YY[i+1]],color = myColor(0), label = "regressionData")        
    85     plt.text(0.1, 1.0, "weakCount = " + str(weakCount) + "
    errorRatio = " + str(round(errorRatio,4)), size = 15, ha="center", va="center", bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 1., 1.)))       
    86     R = [ Rectangle((0,0),0,0, color = myColor(1 - i)) for i in range(2) ]
    87     plt.legend(R, ["originalData", "regressionData"], loc=[0.79, 0.012], ncol=1, numpoints=1, framealpha = 1)
    88 
    89     fig.savefig("R:\weakCount" + str(weakCount) + ".png")
    90     plt.close()   
    91 
    92 if __name__ == '__main__':
    93     test(1)
    94     test(2)
    95     test(3)
    96     test(4)           
    97     test(20)            
    98     test(100)            

    ● 输出结果,包含训练轮数和测试集上的平均绝对误差(代码中记为 errorRatio)

    weakCount = 1, errorRatio = 0.184200
    weakCount = 2, errorRatio = 0.142200
    weakCount = 3, errorRatio = 0.115000
    weakCount = 4, errorRatio = 0.102500
    weakCount = 20, errorRatio = 0.035300
    weakCount = 100, errorRatio = 0.016000

    ● 画图,总数据量 1000,训练集 300,测试集 700

    ● 画图,当总数据量减为 500,训练集 150 时,出现了过拟合?

  • 相关阅读:
    Firefly 3288又一次制作android和lubuntu双系统固件
    想做一个完美的健身训练计划,你须要知道什么?
    【LeetCode-面试算法经典-Java实现】【075-Sort Colors (颜色排序)】
    每天进步一点点——Ganglia的Python扩展模块开发
    Unity3D-rigidBody.velocity
    泛型初识
    HDOJ 5418 Victor and World 状压DP
    UIPopoverController具体解释
    怎样提升站点的性能?
    PHP操作MongoDB数据库具体样例介绍(增、删、改、查) (六)
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/11302771.html
Copyright © 2011-2022 走看看