zoukankan      html  css  js  c++  java
  • [学习笔记][Python机器学习:预测分析核心算法][多变量回归:使用交叉验证来估计套索模型的样本外错误]

      1 import numpy
      2 from sklearn import datasets, linear_model
      3 from sklearn.linear_model import LassoCV
      4 from math import sqrt
      5 import matplotlib.pyplot as plot
      6 
      7 #read data into iterable
      8 #target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
      9 #data = urllib2.urlopen(target_url)
     10 target_url_file = open('winequality-red.csv','r')
     11 data = target_url_file.readlines()
     12 target_url_file.close()
     13 
     14 xList = []
     15 labels = []
     16 names = []
     17 firstLine = True
     18 for line in data:
     19     if firstLine:
     20         names = line.strip().split(";")
     21         firstLine = False
     22     else:
     23         #split on semi-colon
     24         row = line.strip().split(";")
     25         #put labels in separate array
     26         labels.append(float(row[-1]))
     27         #remove label from row
     28         row.pop()
     29         #convert row to floats
     30         floatRow = [float(num) for num in row]
     31         xList.append(floatRow)
     32 
     33 #Normalize columns in x and labels
     34 #Note: be careful about normalization.
     35 #Some penalized regression packages include it and some don't.
     36 
     37 nrows = len(xList)
     38 ncols = len(xList[0])
     39 
     40 #calculate means and variances
     41 xMeans = []
     42 xSD = []
     43 for i in range(ncols):
     44     col = [xList[j][i] for j in range(nrows)]
     45     mean = sum(col)/nrows
     46     xMeans.append(mean)
     47     colDiff = [(xList[j][i] - mean) for j in range(nrows)]
     48     sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrows)])
     49     stdDev = sqrt(sumSq/nrows)
     50     xSD.append(stdDev)
     51 
     52 #use calculate mean and standard deviation to normalize xList
     53 xNormalized = []
     54 for i in range(nrows):
     55     rowNormalized = [(xList[i][j] - xMeans[j])/xSD[j] for j in range(ncols)]
     56     xNormalized.append(rowNormalized)
     57 
     58 #Normalize labels
     59 meanLabel = sum(labels)/nrows
     60 sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range(nrows)])/nrows)
     61 
     62 labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrows)]
     63 
     64 #Convert list of list to np array for input to sklearn packages
     65 
     66 #Unnormalized labels
     67 Y = numpy.array(labels)
     68 
     69 #normalized lables
     70 Y = numpy.array(labelNormalized)
     71 
     72 #Unnormalized X's
     73 X = numpy.array(xList)
     74 
     75 #Normlized Xss
     76 X = numpy.array(xNormalized)
     77 
     78 #Call LassoCV from sklearn.linear_model
     79 #10折交叉验证
     80 wineModel = LassoCV(cv=10).fit(X, Y)
     81 
     82 # Display results
     83 
     84 
     85 plot.figure()
     86 plot.figure(figsize=(12,8))
     87 #随着alpha值的变化,均方误差的变化曲线
     88 plot.plot(wineModel.alphas_, wineModel.mse_path_, ':')
     89 #验证过程中,随着alpha值的变化,均方误差的平均曲线,并设置的alpha变化区域
     90 plot.plot(wineModel.alphas_, wineModel.mse_path_.mean(axis=-1),
     91           label='Average MSE Across Folds', linewidth=2)
     92 #最佳的alpha值,每次验证系统认为的最合适的alpha值
     93 plot.axvline(wineModel.alpha_, linestyle='dotted',label='CV Estimate of Best alpha')
     94 #这种轴半对数刻度曲线是将自变量对10取对数,可以有效的看出数据指数型变化时的衰变情况。
     95 plot.semilogx()
     96 #为图表打标注
     97 plot.legend()
     98 #当前的图表和子图可以使用plt.gcf()和plt.gca()获得,分别表示Get Current Figure和Get Current Axes。
     99 ax = plot.gca()
    100 #x轴反向
    101 ax.invert_xaxis()
    102 
    103 plot.xlabel('alpha')
    104 plot.ylabel('Mean Square Error')
    105 plot.axis('tight')
    106 plot.show()
    107 
    108 #print out the value of alpha that minimizes the Cv-error
    109 print("alpha Value that Minimizes CV Error  ",wineModel.alpha_)
    110 print("Minimum MSE  ", min(wineModel.mse_path_.mean(axis=-1)))

    alpha Value that Minimizes CV Error   0.013561387700964642
    Minimum MSE   0.6655849206002853
  • 相关阅读:
    20145228 《信息安全系统设计基础》第十一周学习总结 (1)
    20145228 《信息安全系统设计基础》第十周学习总结 (2)
    20145228 《信息安全系统设计基础》第十周学习总结 (1)
    20145203 《信息安全系统设计基础》课程总结
    20145203 《信息安全系统设计基础》第十四周学习总结
    20145203 《信息安全系统设计基础》第十三周学习总结
    20145203盖泽双 反汇编代码实践
    20145203 《信息安全系统设计基础》第十二周学习总结
    20145203盖泽双《信息安全系统设计》 实验五 网络通信
    20145203盖泽双《信息安全系统设计》实验四 驱动程序设计
  • 原文地址:https://www.cnblogs.com/jaysonguan/p/12402826.html
Copyright © 2011-2022 走看看