zoukankan      html  css  js  c++  java
  • knn 分类Helen数据集

    • knn流程
    1. 数据读取
    2. 数据归一化
    3. knn实现
    • 数据分析

    以数据的前两项特征对数据进行划分得到以下散点图:

    • 具体实现
    import numpy as np
    def read_data(path):
        """Parse the data set into a feature matrix and an integer label vector.

        Every non-blank line is split on whitespace; the first three fields are
        converted to floats and the last field is read as the class name.

        :param path: despite its name, an already opened text file (any iterable
            that yields lines works), not a file-system path.
        :return: ``(data, label)`` as NumPy arrays. ``label`` holds 3 for
            ``'didntLike'``, 2 for ``'largeDoses'`` and 1 for anything else.
        """
        # Class name -> numeric code; every unlisted name falls back to 1.
        label_codes = {'didntLike': 3, 'largeDoses': 2}
        data = []
        label = []
        for line in path:
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing newline at the end of the file)
                # has no last field to read, so skip it instead of crashing.
                continue
            data.append([float(value) for value in fields[0:3]])
            label.append(label_codes.get(fields[-1], 1))
        return np.array(data), np.array(label)
    
    def normalized(data):
        """Unfinished normalisation stub.

        Only the per-column minimum of ``data`` is computed; the value is never
        used and the function always returns ``None``.

        :param data: object exposing a NumPy-style ``min(axis)`` method.
        :return: ``None``.
        """
        column_minimum = data.min(axis=0)  # computed but not used any further
        return None
    
    def standdata(traindata):
        """Standardise each feature to zero mean and unit variance (z-score).

        Mean and standard deviation are taken along axis 0 and applied through
        broadcasting, so both a 2-D sample matrix and a 1-D vector are handled.

        :param traindata: array-like of samples (rows) by features (columns).
        :return: ``(standardised, mean, std)`` where ``mean`` and ``std`` are the
            statistics computed along axis 0. ``std`` is returned unmodified, so
            it may contain zeros for constant features.
        """
        traindata = np.asarray(traindata)
        meandata0 = np.mean(traindata, axis=0)
        stddata0 = np.std(traindata, axis=0)
        # A constant feature has zero spread; dividing by 1 instead maps it to 0
        # rather than producing NaN from 0 / 0.
        safe_std = np.where(stddata0 == 0, 1, stddata0)
        standardised = (traindata - meandata0) / safe_std
        return standardised, meandata0, stddata0
    def autoNorm(x):
        """
        Min-max normalisation: rescale every feature column to the range [0, 1].

        :param x: feature array to normalise (samples along axis 0)
        :return: the normalised array, the per-feature range (max - min) and the
            per-feature minimum. The range is returned unmodified, so it may
            contain zeros for constant features.
        """

        minVals = x.min(axis=0)
        maxVals = x.max(axis=0)
        ranges = maxVals - minVals

        # A constant feature has a zero range; dividing by 1 instead maps it to 0
        # rather than producing NaN from 0 / 0.
        safe_ranges = np.where(ranges == 0, 1, ranges)
        x_new = (x - minVals) / safe_ranges  # broadcast over the sample axis

        return x_new, ranges, minVals
    
    def knn(traindata,testdata,label,k):
        """Classify one sample by majority vote among its k nearest neighbours.

        Neighbours are ranked by Euclidean distance. Any set of labels is
        accepted (not only 1..3), and ``k`` is capped at the number of training
        samples. When several classes share the highest vote count, the smallest
        label wins.

        :param traindata: 2-D array of training samples, one row per sample.
        :param testdata: 1-D feature vector of the sample to classify.
        :param label: labels of the training samples, aligned with the rows of
            ``traindata``.
        :param k: number of neighbours that take part in the vote.
        :return: the winning label as a native Python scalar.
        :raises ValueError: if ``k`` is smaller than 1 or there are no training
            samples.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")
        distance = np.sqrt(np.sum((traindata - testdata) ** 2, axis=1))
        if distance.size == 0:
            raise ValueError("traindata must contain at least one sample")
        # Asking for more neighbours than exist simply means "use all of them".
        k = min(k, distance.size)
        nearest = distance.argsort()[:k]
        # np.unique returns the classes in ascending order and argmax picks the
        # first maximum, so ties are resolved in favour of the smallest label.
        classes, counts = np.unique(np.asarray(label)[nearest], return_counts=True)
        return classes[np.argmax(counts)].item()
    
    def testknn(data,label,k):
        """Hold-out evaluation of ``knn``: first 90% of rows train, the rest test.

        The split is purely positional (the rows are not shuffled). For every
        test sample the predicted and the expected label are printed.

        :param data: feature array, one row per sample.
        :param label: labels aligned with the rows of ``data``.
        :param k: number of neighbours forwarded to ``knn``.
        :return: fraction of test samples that were classified correctly.
        """
        # Index of the first test row: everything before it is training data.
        split = int(np.shape(data)[0] * 0.9)
        train_data, test_data = data[0:split], data[split:]
        train_label, test_label = label[0:split], label[split:]

        hits = 0
        for idx, expected in enumerate(test_label):
            predicted = knn(train_data, test_data[idx], train_label, k)
            if predicted == expected:
                hits += 1
            print(predicted, expected)
        return float(hits) / len(test_data)
    
    if __name__ == '__main__':
        # Script entry point: load the data set, min-max normalise the features
        # and print the hold-out accuracy of the k-NN classifier with k = 25.
        path='Knn_Helen'
        # Class names ordered by the numeric labels 1, 2, 3 that read_data
        # assigns; the list is not used below.
        true_label=["smallDoses",'largeDoses','didntLike']
        # The context manager closes the file even if parsing fails.
        with open(path,'r') as file:
            print('=======')
            data,label=read_data(file)
        a,b,c=autoNorm(data)
        acc=testknn(a,label,25)
        print(acc)
    

    在上述设置下(前90%的样本作为训练集、后10%的样本作为测试集,k=25),准确率能达到95%以上

  • 相关阅读:
    Ubuntu16.04 安装 Theano+GPU
    ubuntu源与常用python配置pip源(win)、pip常用命令
    集群(heartbeat)搭建
    Linux下搭建企业共享目录方案之------samba
    LAMP的安装和注意事项
    Linux最小化安装,忘记安装开发工具的解决方法
    去掉Linux尖锐的提示音
    最小化安装CentOS7,没有ifconfig命令---yum search command_name搜索未知包名
    编译安装php-5.4.44
    configure: error: Please reinstall the libcurl distribution
  • 原文地址:https://www.cnblogs.com/peng-yuan/p/14703275.html
Copyright © 2011-2022 走看看