zoukankan      html  css  js  c++  java
  • 机器学习实战-学习笔记-第二章

    2.1节

    1.切换到工作目录

    2.在工作目录下新建一个python脚本文件kNN.py,内容如下:

    from numpy import *
    import operator
    
    def createDataSet():
        """Build the four-point toy data set used to try out the kNN classifier.

        Returns:
            A ``(group, labels)`` pair: a 4x2 numpy array of sample points and
            the list of class labels, one label per row of ``group``.
        """
        # Keep each point next to its label so the pairing is visible at a glance.
        samples = [
            ([1.0, 1.1], 'A'),
            ([1.0, 1.0], 'A'),
            ([0, 0], 'B'),
            ([0, 0.1], 'B'),
        ]
        group = array([point for point, _ in samples])
        labels = [tag for _, tag in samples]
        return group, labels
    
    
    
    def classify0(inX, dataSet, labels, k):
        """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

        Args:
            inX: The point to classify; one value per column of ``dataSet``.
            dataSet: 2-D numpy array of training points, one row per sample.
            labels: Class labels, indexed in step with the rows of ``dataSet``.
            k: Number of nearest neighbours that take part in the vote.

        Returns:
            The label with the most votes among the ``k`` rows of ``dataSet``
            closest to ``inX`` by Euclidean distance.
        """
        dataSetSize = dataSet.shape[0]
        # Repeat inX once per training row so the subtraction is element-wise.
        diffMat = tile(inX, (dataSetSize, 1)) - dataSet
        distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
        # Row indices ordered from the nearest training point to the farthest.
        sortedDistIndicies = distances.argsort()
        classCount = {}
        for i in range(k):
            voteILabel = labels[sortedDistIndicies[i]]
            classCount[voteILabel] = classCount.get(voteILabel, 0) + 1
        # items() rather than iteritems(): iteritems() exists only on Python 2
        # dicts, while items() works on both Python 2 and Python 3.
        sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
        return sortedClassCount[0][0]
        

    然后进入Python REPL:

    F:\studio\MachineLearningInAction\ch02>python
    Python 2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 16:44:52) [MSC v.1500 64 bit (AMD64)] on win32
    Type "help", "copyright", "credits" or "license" for more information.
    Anaconda is brought to you by Continuum Analytics.
    Please check out: http://continuum.io/thanks and https://binstar.org
    >>> from numpy import *
    >>> import operator
    >>> import kNN
    >>> group, labels = kNN.createDataSet()
    >>> group
    array([[ 1. ,  1.1],
           [ 1. ,  1. ],
           [ 0. ,  0. ],
           [ 0. ,  0.1]])
    >>> labels
    ['A', 'A', 'B', 'B']
    >>> kNN.classify0([0,0], group, labels, 3)
    'B'
    >>>

    2.2节

    修改kNN.py文件,增加如下内容:

    def getLabelID(labelName):
        """Translate a dating-data label name into its numeric class ID.

        Returns 3 for "largeDoses", 2 for "smallDoses", and 1 for any other value.
        """
        for knownName, labelID in (("largeDoses", 3), ("smallDoses", 2)):
            if labelName == knownName:
                return labelID
        # Anything that is not one of the two names above falls into class 1.
        return 1
    
        
    def file2matrix(filename):
        """Load a tab-separated data file into a feature matrix and label list.

        Each non-blank line is expected to hold at least three numeric fields,
        with a label name in the last field.

        Args:
            filename: Path of the text file to read.

        Returns:
            A ``(returnMat, classLabelVector)`` pair: an N x 3 float array with
            the first three fields of every data line, and a list holding the
            numeric label ID (as produced by ``getLabelID``) of each line.

        Raises:
            ValueError: If one of the first three fields of a line is not numeric.
        """
        # The context manager closes the file even if parsing fails below.
        with open(filename) as fr:
            # Drop blank lines (e.g. a trailing empty line at end of file) so they
            # neither reserve an all-zero row nor break the numeric conversion.
            dataLines = [line.strip() for line in fr if line.strip()]
        returnMat = zeros((len(dataLines), 3))
        classLabelVector = []
        for index, line in enumerate(dataLines):
            listFromLine = line.split('\t')
            # Convert explicitly instead of relying on numpy to cast strings.
            returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
            classLabelVector.append(getLabelID(listFromLine[-1]))
        return returnMat, classLabelVector

    注意这里和原书的代码不一样:由于测试数据中的label是名称而不是数字,这里我定义了一个从labelName到labelID的转换函数getLabelID。

    输入如下脚本

    >>> from numpy import *
    >>> import operator
    >>> import kNN
    >>> datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
    >>> datingLabels[0:20]
    [3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]
    >>> import matplotlib
    >>> import matplotlib.pyplot as plt
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.scatter(datingDataMat[:,1],datingDataMat[:,2])
    <matplotlib.collections.PathCollection object at 0x0000000002E19278>
    >>> plt.show()

    注意最后一句plt.show():如果没有这一句,图形窗口是不会显示出来的。

    >>> ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15*array(datingLabels), 15*array(datingLabels))
    <matplotlib.collections.PathCollection object at 0x00000000035A9908>
    >>> plt.show()
    >>>
    # Script form of the plotting session: load the dating data and draw a
    # scatter plot of its first two columns.
    from numpy import *
    import operator
    import kNN
    # file2matrix returns the feature matrix and the numeric label of each row.
    datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
    import matplotlib
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Column 0 against column 1. The label-derived array 15*array(datingLabels)
    # is passed as both the third and fourth positional arguments of scatter
    # (marker size and colour in matplotlib's signature), so each class is
    # drawn with its own size and colour.
    ax.scatter(datingDataMat[:,0],datingDataMat[:,1],15*array(datingLabels), 15*array(datingLabels))
    # Nothing is displayed until show() is called.
    plt.show()

    接着对数据进行规范化(需要先按照原书在kNN.py中增加autoNorm函数,本文没有列出该函数的代码):

    # Input for the interactive prompt: normalize the dating data and inspect the result.
    # Reload kNN so that definitions added to kNN.py after the first import are picked up.
    reload(kNN)
    # NOTE(review): autoNorm is not listed in this post; it must already exist in kNN.py.
    normMat, ranges, minVals = kNN.autoNorm(datingDataMat)
    # Bare names are echoed by the interactive prompt; in a script file they print nothing.
    normMat
    ranges
    minVals

    结果如下:

    F:\studio\MachineLearningInAction\ch02>python
    Python 2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 16:44:52) [MSC v.1500 64 bit (AMD64)] on win32
    Type "help", "copyright", "credits" or "license" for more information.
    Anaconda is brought to you by Continuum Analytics.
    Please check out: http://continuum.io/thanks and https://binstar.org
    >>> from numpy import *
    >>> import operator
    >>> import kNN
    >>> datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
    >>> normMat, ranges, minVals = kNN.autoNorm(datingDataMat)
    >>> normMat
    array([[ 0.44832535,  0.39805139,  0.56233353],
           [ 0.15873259,  0.34195467,  0.98724416],
           [ 0.28542943,  0.06892523,  0.47449629],
           ...,
           [ 0.29115949,  0.50910294,  0.51079493],
           [ 0.52711097,  0.43665451,  0.4290048 ],
           [ 0.47940793,  0.3768091 ,  0.78571804]])
    >>> ranges
    array([  9.12730000e+04,   2.09193490e+01,   1.69436100e+00])
    >>> minVals
    array([ 0.      ,  0.      ,  0.001156])
    >>>

    测试算法:向kNN.py中增加如下内容:

    def datingClassTest(hoRatio=0.10, filename='datingTestSet.txt', k=3):
        """Run a hold-out test of the kNN classifier on the dating data set.

        The first ``hoRatio`` fraction of the normalized rows is used as test
        vectors; the remaining rows serve as the training set. One line is
        printed per test vector, an extra "ERROR" line per misclassification,
        and the overall error rate at the end.

        Args:
            hoRatio: Fraction of the rows held out for testing (default 0.10).
            filename: Data file passed to ``file2matrix``.
            k: Number of neighbours passed to ``classify0`` (default 3).

        Raises:
            ZeroDivisionError: If ``hoRatio`` is so small that no row is held out.
        """
        datingDataMat, datingLabels = file2matrix(filename)
        # Only the normalized matrix is needed here; autoNorm's other two
        # return values (ranges and minimum values) are ignored.
        normMat, _, _ = autoNorm(datingDataMat)
        m = normMat.shape[0]
        numTestVecs = int(m * hoRatio)
        errorCount = 0.0
        for i in range(numTestVecs):
            classifierResult = classify0(
                normMat[i, :],
                normMat[numTestVecs:m, :],
                datingLabels[numTestVecs:m],
                k,
            )
            # print(...) with a single pre-formatted string behaves the same under
            # the Python 2 print statement and the Python 3 print function.
            print("the classifier came back with: %d,  the real answer is: %d" % (classifierResult, datingLabels[i]))
            if classifierResult != datingLabels[i]:
                errorCount += 1.0
                print("ERROR: the classifier came back with: %d,  the real answer is: %d" % (classifierResult, datingLabels[i]))
        print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
        

    在Python REPL中运行脚本

    >>> reload(kNN)
    <module 'kNN' from 'kNN.py'>
    >>> kNN.datingClassTest()
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 2
    ERROR: the classifier came back with: 1,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 1
    ERROR: the classifier came back with: 3,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 3,  the real answer is: 1
    ERROR: the classifier came back with: 3,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 3
    ERROR: the classifier came back with: 2,  the real answer is: 3
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 3,  the real answer is: 3
    the classifier came back with: 2,  the real answer is: 2
    the classifier came back with: 1,  the real answer is: 1
    the classifier came back with: 3,  the real answer is: 1
    ERROR: the classifier came back with: 3,  the real answer is: 1
    the total error rate is: 0.050000
  • 相关阅读:
    MySQL存储写入性能严重抖动分析
    关于MySQL的commit非规律性失败案例的深入分析
    MySQL存储写入速度慢分析
    MySQL缓存之Qcache与buffer pool对比
    SQL执行过程中的性能负载点
    关于MySQL用户会话及连接线程
    如何查询、修改参数状态值
    genymotion 前端调试
    name是个特殊的变量名吗
    background-size 导致的背景不居中问题
  • 原文地址:https://www.cnblogs.com/littlesuccess/p/5021535.html
Copyright © 2011-2022 走看看