zoukankan      html  css  js  c++  java
  • 001 KNN分类 最邻近算法

    1.文件
    5.0,3.5,1.6,0.6,apple
    5.1,3.8,1.9,0.4,apple
    4.8,3.0,1.4,0.3,apple
    5.1,3.8,1.6,0.2,apple
    4.6,3.2,1.4,0.2,apple
    5.3,3.7,1.5,0.2,apple
    5.0,3.3,1.4,0.2,apple
    7.0,3.2,4.7,1.4,orange
    6.4,3.2,4.5,1.5,orange
    6.9,3.1,4.9,1.5,orange
    5.5,2.3,4.0,1.3,orange
    6.5,2.8,4.6,1.5,orange
    5.7,2.8,4.5,1.3,orange
    6.3,3.3,4.7,1.6,orange
    7.3,2.9,6.3,1.8,banana
    6.7,2.5,5.8,1.8,banana
    7.2,3.6,6.1,2.5,banana
    6.5,3.2,5.1,2.0,banana
    6.4,2.7,5.3,1.9,banana
    6.8,3.0,5.5,2.1,banana
    5.7,2.5,5.0,2.0,banana
    5.8,2.8,5.1,2.4,banana

    2 代码

    # -*- coding: UTF-8 -*-
    import math
    import csv
    import random
    import operator
    
    '''
    @author:hunter
    @time:2017.03.31
    '''
    
    class KNearestNeighbor(object):
        def __init__(self):
            pass
    
        def loadDataset(self,filename, split, trainingSet, testSet):  # 加载数据集  split以某个值为界限分类train和test
            with open(filename, 'r') as csvfile:
                lines = csv.reader(csvfile)   #读取所有的行
                dataset = list(lines)     #转化成列表
                for x in range(len(dataset)-1):
                    for y in range(4):
                        dataset[x][y] = float(dataset[x][y])
                    if random.random() < split:   # 将所有数据加载到train和test中 生成0和1的随机浮点数
                        trainingSet.append(dataset[x])
                    else:
                        testSet.append(dataset[x])
    
    
        def calculateDistance(self,testdata, traindata, length):   # 计算距离
            distance = 0     # length表示维度 数据共有几维
            for x in range(length):
                distance += pow((testdata[x]-traindata[x]), 2)
            return math.sqrt(distance)
    
    
        def getNeighbors(self,trainingSet, testInstance, k):  # 返回最近的k个边距
            distances = []
            length = len(testInstance)-1
            for x in range(len(trainingSet)):   #对训练集的每一个数计算其到测试集的实际距离
                dist = self.calculateDistance(testInstance, trainingSet[x], length)
                print('训练集:{}-距离:{}'.format(trainingSet[x], dist))
                distances.append((trainingSet[x], dist))
            distances.sort(key=operator.itemgetter(1))   # 把距离从小到大排列   
            neighbors = []
            for x in range(k):   #排序完成后取前k个距离
                neighbors.append(distances[x][0])
                print(neighbors)
                return neighbors
    
    
        def getResponse(self,neighbors):  # 根据少数服从多数,决定归类到哪一类
            classVotes = {}
            for x in range(len(neighbors)):
                response = neighbors[x][-1]  # 统计每一个分类的多少
                if response in classVotes:
                    classVotes[response] += 1
                else:
                    classVotes[response] = 1   # 初始值为1
            print(classVotes.items())
            sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True) #reverse按降序的方式排列
            return sortedVotes[0][0]
    
    
        def getAccuracy(self,testSet, predictions):  # 准确率计算
            correct = 0
            for x in range(len(testSet)):
                if testSet[x][-1] == predictions[x]:   #predictions是预测的和testset实际的比对
                    correct += 1
            print('共有{}个预测正确,共有{}个测试数据'.format(correct,len(testSet)))
            return (correct/float(len(testSet)))*100.0
    
    
        def Run(self):
            trainingSet = []
            testSet = []
            split = 0.75
            self.loadDataset(r'testdata.txt', split, trainingSet, testSet)   #数据划分
            print('Train set: ' + str(len(trainingSet)))
            print('Test set: ' + str(len(testSet)))
            #generate predictions
            predictions = []
            k = 3    # 取最近的3个数据
            # correct = []
            for x in range(len(testSet)):    # 对所有的测试集进行测试
                neighbors = self.getNeighbors(trainingSet, testSet[x], k)   #找到3个最近的邻居
                result = self.getResponse(neighbors)    # 找这3个邻居归类到哪一类
                predictions.append(result)
                # print('predictions: ' + repr(predictions))        返回一个它在python中的描述
                # print('>predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
            # print(correct)
            accuracy = self.getAccuracy(testSet,predictions)   
            print('Accuracy: ' + repr(accuracy) + '%')
    
    
    if __name__ == '__main__':
        a = KNearestNeighbor()
        a.Run()
    

      

  • 相关阅读:
    sed命令使用与正则表达式元字符
    centos6和centos7的网卡配置
    制作启动光盘centos6
    搭建一个私有CA
    yum命令
    while语法命令
    linux的进程与计划任务命令总结
    磁盘管理命令应用
    脚本命令与笔记
    重定向命令的使用
  • 原文地址:https://www.cnblogs.com/think-and-do/p/7084188.html
Copyright © 2011-2022 走看看