zoukankan      html  css  js  c++  java
  • K-近邻算法学习

    # -*- coding: utf-8 -*-
    from numpy import *
    import operator
    
    def createDataSet():
        """Return the four-sample toy training set used by the kNN demo.

        Returns:
            A tuple ``(group, labels)``: ``group`` is a 4x2 numpy array with
            one sample per row, and ``labels`` is the list of class labels
            in the same row order.
        """
        samples = [
            [1.0, 1.1],
            [1.0, 1.0],
            [0, 0],
            [0, 0.1],
        ]
        # The first two samples belong to class 'A', the last two to 'B'.
        classes = ['A'] * 2 + ['B'] * 2
        return array(samples), classes
    
    def classify0(inX,dataSet,labels,k):
        """Classify ``inX`` by majority vote among its k nearest training samples.

        Distances are Euclidean. Every intermediate value is printed so the
        individual steps of the algorithm can be followed on the console.

        Args:
            inX: feature vector to classify.
            dataSet: 2-D numpy array holding one training sample per row.
            labels: sequence of labels; ``labels[i]`` belongs to ``dataSet[i]``.
            k: number of nearest neighbours that take part in the vote.

        Returns:
            The label that received the most votes among the k nearest samples.
        """
        # NOTE: print is called with exactly one argument everywhere so the
        # output is the same under Python 2 and the code also runs on Python 3.
        print('inX')
        print(inX)
        # Number of training samples (rows).
        dataSetSize = dataSet.shape[0]
        print('dataSetSize:')
        print(dataSetSize)

        # Repeat the input vector once per training sample so it can be
        # subtracted from the whole training set in one operation.
        tiledInput = tile(inX,(dataSetSize,1))
        print('tile(inX,(dataSetSize,1))')
        print(tiledInput)

        diffMat = tiledInput-dataSet
        print('diffMat')
        print(diffMat)

        # Square every coordinate difference.
        sqDiffMat = diffMat**2
        print('sqDiffMat')
        print(sqDiffMat)

        # Sum the squared differences of each row.
        sqDistances = sqDiffMat.sum(axis=1)
        print('sqDistances')
        print(sqDistances)
        # The square root of the sum is the Euclidean distance.
        distances = sqDistances**0.5
        print('distances')
        print(distances)

        # Indices that would sort the distances in ascending order,
        # e.g. distances [3,1,2] give indices [1,2,0].
        sortedDistIndicies = distances.argsort()
        print('sortedDistIndicies')
        print(sortedDistIndicies)
        classCount = {}
        # Let the k nearest samples vote.
        for i in range(k):
            print('I='+str(i))
            # Label of the sample with the i-th smallest distance.
            voteIlabel = labels[sortedDistIndicies[i]]
            print('voteIlabel='+voteIlabel)
            print('classCount.get(voteIlabel,0)='+str(classCount.get(voteIlabel,0)))

            # Count one vote for that label.
            classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
        print('classCount')
        print(classCount)
        # Sort the (label, votes) pairs by vote count, highest first.
        # items() replaces the Python-2-only iteritems().
        sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
        print('sortedClassCount')
        print(sortedClassCount)
        return sortedClassCount[0][0]
    
    # Demo: build the toy training set and classify the point [0.1, 0.2]
    # with k=3. The functions are defined in this same listing, so they are
    # called directly (no `kNN` module is imported here).
    group,labels=createDataSet()
    print(group)
    print(labels)
    print(classify0([0.1,0.2],group,labels,3))
    

    最终的输出结果为

    [[ 1. 1.1]
    [ 1. 1. ]
    [ 0. 0. ]
    [ 0. 0.1]]
    ['A', 'A', 'B', 'B']
    inX
    [0.1, 0.2]
    dataSetSize:
    4
    tile(inX,(dataSetSize,1))
    [[ 0.1 0.2]
    [ 0.1 0.2]
    [ 0.1 0.2]
    [ 0.1 0.2]]
    diffMat
    [[-0.9 -0.9]
    [-0.9 -0.8]
    [ 0.1 0.2]
    [ 0.1 0.1]]
    sqDiffMat
    [[ 0.81 0.81]
    [ 0.81 0.64]
    [ 0.01 0.04]
    [ 0.01 0.01]]
    sqDistances
    [ 1.62 1.45 0.05 0.02]
    distances
    [ 1.27279221 1.20415946 0.2236068 0.14142136]
    sortedDistIndicies
    [3 2 1 0]
    I=0
    voteIlabel=B
    classCount.get(voteIlabel,0)=0
    I=1
    voteIlabel=B
    classCount.get(voteIlabel,0)=1
    I=2
    voteIlabel=A
    classCount.get(voteIlabel,0)=0
    classCount
    {'A': 1, 'B': 2}
    sortedClassCount
    [('B', 2), ('A', 1)]
    B

      

  • 相关阅读:
    Django的FBV和CBV
    爬虫-----selenium模块自动爬取网页资源
    python摸爬滚打之day33----线程
    python摸爬滚打之day032 管道 数据共享 进程池
    python摸爬滚打之day030----进程
    爬虫重复请求超时
    指定页面刷新时间前端
    requests post请求,加上会话功能 以及url 编码问题
    爬虫常用mysql
    python操作excel以及word文档,pdf文档
  • 原文地址:https://www.cnblogs.com/kevin-h-wang/p/6589413.html
Copyright © 2011-2022 走看看