zoukankan      html  css  js  c++  java
  • python实现knn

    邻近算法,或者说K最近邻(kNN,k-NearestNeighbor)分类算法是数据挖掘分类技术中最简单的方法之一。所谓K最近邻,就是k个最近的邻居的意思,说的是每个样本都可以用它最接近的k个邻居来代表。
    kNN算法的核心思想是如果一个样本在特征空间中的k个最相邻的样本中的大多数属于某一个类别,则该样本也属于这个类别,并具有这个类别上样本的特性。
    概念很简单,更多的解释可以参考百度百科,有图有示例,讲的非常清楚。
    接下来看看怎么用python实现KNN,代码中都是详细的注释:
     
    首先是载入数据的几个函数,这里主要看看CIFAR-10的数据格式就知道代码的意思了
     1 from __future__ import print_function
     2 
     3 from six.moves import cPickle as pickle
     4 import numpy as np
     5 import os
     6 from scipy.misc import imread
     7 import platform
     8 
     9 def load_pickle(f):
    10     version = platform.python_version_tuple()
    11     if version[0] == '2':
    12         return  pickle.load(f)
    13     elif version[0] == '3':
    14         return  pickle.load(f, encoding='latin1')
    15     raise ValueError("invalid python version: {}".format(version))
    16 
    17 def load_CIFAR_batch(filename):
    18   """ CIRAR的数据是分批的,这个函数的功能是载入一批数据 """
    19   with open(filename, 'rb') as f:
    20     datadict = load_pickle(f) #以二进制方式打开文件
    21     X = datadict['data']
    22     Y = datadict['labels']
    23     X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
    24     Y = np.array(Y)
    25     return X, Y
    26 
    27 def load_CIFAR10(ROOT):
    28   """ load 所有的数据 """
    29   xs = []
    30   ys = []
    31   for b in range(1,6):
    32     f = os.path.join(ROOT, 'data_batch_%d' % (b, ))
    33     X, Y = load_CIFAR_batch(f)
    34     xs.append(X)
    35     ys.append(Y)    
    36   Xtr = np.concatenate(xs)
    37   Ytr = np.concatenate(ys)
    38   del X, Y
    39   Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    40   return Xtr, Ytr, Xte, Yte

    然后是KNN类,定义了KNN的距离的计算方式、训练和预测函数:

      1 import numpy as np
      2 
      3 class KNearestNeighbor(object):
      4   """ 
      5   kNN 分类器 
      6   这里度量两张图片之间的距离就直接简单的采用L2距离
      7   实际上要达到比较好的效果需要设计更好的距离距离方式 
      8   """
      9 
     10   def __init__(self):
     11     pass
     12 
     13   def train(self, X, y):
     14     """
     15     训练过程基本上没有什么操作,只是简单的记录下所有的数据
     16 
     17     Inputs:
     18     - X(N, D) N个输入图片,每张图片表示为D位向量 
     19     - y(N,) 标签
     20     """
     21     self.X_train = X
     22     self.y_train = y
     23     
     24   def predict(self, X, k=1, num_loops=0):
     25     """
     26     对于新的输入,给出预测分类
     27 
     28     Inputs:
     29     - X(num_test, D) 
     30     - k: 选择用来决定输出的最相近邻居的个数
     31     - num_loops:这里实现了3种方式来实现L2距离的计算,比较一下计算速度,
     32                 都是利用了numpy的broadcast机制。
     33                 可以看到使用numpy内置的方式计算速度远远高于自己写的循环
     34 
     35     Returns:
     36     - y(num_test,):预测的分类下标 
     37  
     38     """
     39     if num_loops == 0:
     40       dists = self.compute_distances_no_loops(X)
     41     elif num_loops == 1:
     42       dists = self.compute_distances_one_loop(X)
     43     elif num_loops == 2:
     44       dists = self.compute_distances_two_loops(X)
     45     else:
     46       raise ValueError('Invalid value %d for num_loops' % num_loops)
     47 
     48     return self.predict_labels(dists, k=k)
     49 
     50   def compute_distances_two_loops(self, X):
     51     """
     52     Inputs:
     53     - X(num_test, D):test data.
     54 
     55     Returns:
     56     - dists(num_test, num_train):dists[i, j]表示测试数据i和训练数据j之间的L2距离
     57     """
     58 
     59     num_test = X.shape[0]
     60     num_train = self.X_train.shape[0]
     61     dists = np.zeros((num_test, num_train))
     62     for i in range(num_test):
     63       for j in range(num_train):
     64         dists[i,j]=np.sqrt(np.sum(np.square(X[i]-self.X_train[j])))
     65     return dists
     66 
     67   def compute_distances_one_loop(self, X):
     68     num_test = X.shape[0]
     69     num_train = self.X_train.shape[0]
     70     dists = np.zeros((num_test, num_train))
     71     for i in range(num_test):
     72       dists[i,:]=np.sqrt(np.sum(np.square(X[i]-self.X_train),axis=1))
     73     return dists
     74 
     75   def compute_distances_no_loops(self, X):
     76  
     77     num_test = X.shape[0]
     78     num_train = self.X_train.shape[0]
     79     dists = np.zeros((num_test, num_train)) 
     80     #这里需要使用一点矩阵和广播的小技巧,具体的看下面的操作自己体会
     81     dists+=(np.sum(np.square(X),axis=1)).reshape(-1,1)
     82     dists+=(np.sum(np.square(self.X_train),axis=1)).reshape(1,-1)
     83     dists-=2*np.dot(X,self.X_train.T)
     84     dists=np.sqrt(dists)
     85     
     86     return dists
     87 
     88   def predict_labels(self, dists, k=1):
     89     """
     90     给出测试图片和训练图片的距离矩阵,为每个测试图片分类
     91 
     92     Inputs:
     93     - dists(num_test, num_train) 
     94 
     95     Returns:
     96     - y: (num_test,)   
     97     """
     98 
     99     num_test = dists.shape[0]
    100     y_pred = np.zeros(num_test)
    101     for i in range(num_test):
    102       # 长度为k的list保存第i张测试图片距离最近的训练数据的下标
    103       closest_y = []
    104       closest_y=self.y_train[np.argsort(dists[i])[:k]]
    105       y_pred[i]=np.argmax(np.bincount(closest_y))
    106     return y_pred

    最后是主函数部分,载入数据,调用KNN类的实例去训练和预测。并使用k折交叉验证去选择合适的超参数k:

      1 # coding: utf-8
      2 
      3 # KNN
      4 # KNN分类器主要分为两个步骤:
      5 # - 训练阶段, 简单的记忆所有的输入数据(存储)
      6 # - 预测阶段, 对于每一个输入,在所有的存储数据中选择k个与输入最接近的
      7 # - k是超参数
      8 # 
      9 
     10 
     11 import random
     12 import numpy as np
     13 from cs231n.data_utils import load_CIFAR10
     14 import matplotlib.pyplot as plt
     15 
     16 
     17 #get_ipython().run_line_magic('matplotlib', 'inline')
     18 plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
     19 plt.rcParams['image.interpolation'] = 'nearest'
     20 plt.rcParams['image.cmap'] = 'gray'
     21 
     22 # Load CIFAR-10 的数据.
     23 cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
     24 X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
     25 
     26 # 通过输出数据的维度检查数据加载是否正确
     27 print('Training data shape: ', X_train.shape)
     28 print('Training labels shape: ', y_train.shape)
     29 print('Test data shape: ', X_test.shape)
     30 print('Test labels shape: ', y_test.shape)
     31 
     32 
     33 # 可视化一些数据集中的样例.
     34 classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
     35 num_classes = len(classes)
     36 samples_per_class = 7
     37 for y, cls in enumerate(classes):
     38     idxs = np.flatnonzero(y_train == y)   #得到每一类对应图片的下标
     39     idxs = np.random.choice(idxs, samples_per_class, replace=False) #在该类的所有图片中随机选择
     40     for i, idx in enumerate(idxs):
     41         plt_idx = i * num_classes + y + 1
     42         plt.subplot(samples_per_class, num_classes, plt_idx)
     43         plt.imshow(X_train[idx].astype('uint8'))
     44         plt.axis('off')
     45         if i == 0:
     46             plt.title(cls)
     47 plt.show()
     48 
     49 
     50 # 采样,不使用全部数据,训练的更快一点,先来看看效果
     51 # 程序全部跑通之后可以优化一下方式,使用全部数据来试试效果
     52 num_training = 5000
     53 mask = list(range(num_training))
     54 X_train = X_train[mask]
     55 y_train = y_train[mask]
     56 
     57 num_test = 500
     58 mask = list(range(num_test))
     59 X_test = X_test[mask]
     60 y_test = y_test[mask]
     61 
     62 
     63 # 把图片Reshape到一维 
     64 X_train = np.reshape(X_train, (X_train.shape[0], -1))
     65 X_test = np.reshape(X_test, (X_test.shape[0], -1))
     66 print(X_train.shape, X_test.shape)
     67 
     68 
     69 from cs231n.classifiers import KNearestNeighbor
     70 
     71 classifier = KNearestNeighbor()
     72 classifier.train(X_train, y_train)
     73 
     74 dists = classifier.compute_distances_two_loops(X_test)
     75 print(dists.shape)
     76 
     77 # 可视化距离矩阵,每一行代表一张输入图片到所有训练数据的距离
     78 plt.imshow(dists, interpolation='none')
     79 plt.show()
     80 
     81 
     82 
     83 y_test_pred = classifier.predict_labels(dists, k=1)
     84 num_correct = np.sum(y_test_pred == y_test)
     85 accuracy = float(num_correct) / num_test
     86 print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
     87 
     88 
     89 
     90 y_test_pred = classifier.predict_labels(dists, k=5)
     91 num_correct = np.sum(y_test_pred == y_test)
     92 accuracy = float(num_correct) / num_test
     93 print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
     94 
     95 
     96 
     97 
     98 dists_one = classifier.compute_distances_one_loop(X_test)
     99 
    100 # 验证2种实现方式得到的距离矩阵是否等价
    101 difference = np.linalg.norm(dists - dists_one, ord='fro')
    102 print('Difference was: %f' % (difference, ))
    103 if difference < 0.001:
    104     print('Good! The distance matrices are the same')
    105 else:
    106     print('Uh-oh! The distance matrices are different')
    107 
    108 
    109 
    110 dists_two = classifier.compute_distances_no_loops(X_test)
    111 difference = np.linalg.norm(dists - dists_two, ord='fro')
    112 print('Difference was: %f' % (difference, ))
    113 if difference < 0.001:
    114     print('Good! The distance matrices are the same')
    115 else:
    116     print('Uh-oh! The distance matrices are different')
    117 
    118 
    119 
    120 def time_function(f, *args):
    121     """
    122     计算完成f函数花费的时间
    123     """
    124     import time
    125     tic = time.time()
    126     f(*args)
    127     toc = time.time()
    128     return toc - tic
    129 
    130 two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
    131 print('Two loop version took %f seconds' % two_loop_time)
    132 
    133 one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
    134 print('One loop version took %f seconds' % one_loop_time)
    135 
    136 no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)
    137 print('No loop version took %f seconds' % no_loop_time)
    138 
    139 
    140 #使用交叉验证决定k值
    141 num_folds = 5
    142 k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    143 
    144 X_train_folds = []
    145 y_train_folds = []
    146 X_train_folds=np.array_split(X_train,num_folds)
    147 y_train_folds=np.array_split(y_train,num_folds)
    148 print(X_train_folds[0].shape)
    149 print(y_train_folds[0].shape)
    150 
    151 
    152 #记录不同的k值对应的正确率,每个k值会对应num_folds个正确率
    153 k_to_accuracies = {}
    154 
    155 for k_ in k_choices:
    156     k_to_accuracies.setdefault(k_, [])
    157 for i in range(num_folds):
    158     classifier = KNearestNeighbor()
    159     X_val_train = np.concatenate(X_train_folds[0:i] + X_train_folds[i+1:],axis=0)
    160     y_val_train = np.concatenate(y_train_folds[0:i] + y_train_folds[i+1:],axis=0)
    161     classifier.train(X_val_train, y_val_train)
    162     for k_ in k_choices:
    163         y_val_pred = classifier.predict(X_train_folds[i], k=k_)
    164         num_correct = np.sum(y_val_pred == y_train_folds[i])
    165         accuracy = float(num_correct) / len(y_val_pred)
    166         k_to_accuracies[k_] = k_to_accuracies[k_] + [accuracy]
    167 
    168 
    169 
    170 for k in sorted(k_to_accuracies):
    171     for accuracy in k_to_accuracies[k]:
    172         print('k = %d, accuracy = %f' % (k, accuracy))
    173 
    174 
    175 for k in k_choices:
    176     accuracies = k_to_accuracies[k]
    177     plt.scatter([k] * len(accuracies), accuracies)
    178 
    179 
    180 accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())])
    181 accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())])
    182 plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    183 plt.title('Cross-validation on k')
    184 plt.xlabel('k')
    185 plt.ylabel('Cross-validation accuracy')
    186 plt.show()
    187 
    188 
    189 #选择最好的k值计算正确率
    190 best_k = k_choices[np.argmax(accuracies_mean)]
    191 
    192 classifier = KNearestNeighbor()
    193 classifier.train(X_train, y_train)
    194 y_test_pred = classifier.predict(X_test, k=best_k)
    195 
    196 
    197 num_correct = np.sum(y_test_pred == y_test)
    198 accuracy = float(num_correct) / num_test
    199 print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
  • 相关阅读:
    转:javax.net.ssl.SSLHandshakeException: Received fatal alert: handshake_failure 解决方案
    Elementui 导航组件和Vuejs路由结合
    python 在线生成文字云
    TensorFlow创建简单的图片分类系统--机器学习
    kettle maven 配置
    Kettle api 二次开发之 日志的保存
    heatmap for arcgisjsapi
    Spring MVC 使用tomcat中配置的数据源
    点坐标旋转方法
    在servlet中使用Spring注入
  • 原文地址:https://www.cnblogs.com/super-JJboom/p/9725024.html
Copyright © 2011-2022 走看看