zoukankan      html  css  js  c++  java
  • Knn算法实现

    Knn算法实现

     

    k近邻算法

     

    0.引入依赖

    In [8]:
    import numpy as np
    import pandas as pd
    
    #这里直接引入sklearn里面的数据集,iris 鸢尾花
    from sklearn.datasets  import  load_iris
    from sklearn.model_selection import train_test_split   # 切分数据集为训练集和测试集
    from sklearn.metrics import accuracy_score   #计算分类预测的准确率
    
     

    1.数据加载和预处理

    In [23]:
    # Load the iris dataset and wrap its feature matrix in a DataFrame,
    # one column per feature name.
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    # Add a 'class' column holding the species name looked up from the
    # integer target code.
    df['class'] = pd.Series(iris.target).map(lambda code: iris.target_names[code])
    # Display summary statistics for the feature columns.
    df.describe()
    
    Out[23]:
     
     sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
    count 150.000000 150.000000 150.000000 150.000000
    mean 5.843333 3.057333 3.758000 1.199333
    std 0.828066 0.435866 1.765298 0.762238
    min 4.300000 2.000000 1.000000 0.100000
    25% 5.100000 2.800000 1.600000 0.300000
    50% 5.800000 3.000000 4.350000 1.300000
    75% 6.400000 3.300000 5.100000 1.800000
    max 7.900000 4.400000 6.900000 2.500000
    In [24]:
    # Feature matrix and labels; the labels are reshaped into a single column.
    x, y = iris.data, iris.target.reshape(-1, 1)
    
    In [33]:
    # Split into training and test sets: 30% is held out for testing, the
    # split is stratified on the labels, and the seed is fixed.
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=0.3,
        random_state=35,
        stratify=y,
    )
    
    Out[33]:
    array([1.7, 1. , 1.3, 1.5, 3.9, 1.8, 2.1, 7. , 6.2, 0.5, 1.9, 6.2, 2.7,
           7.1, 6.9, 0. , 2. , 2.6, 1.9, 2.3, 2.6, 6.7, 3.8, 7.1, 6.7, 4.9,
           2.2, 2.1, 2.7, 1.3, 2. , 0.8, 2.7, 2.6, 1.4, 1.9, 3.7, 6.9, 2.3,
           2.2, 1.9, 1.2, 1.7, 6.6, 0.5, 6.8, 6.9, 2.5, 6.2, 6.8, 6.7, 3.6,
           7. , 1.5, 1.7, 2.1, 2.7, 3. , 2.2, 1.8, 1.8, 1.7, 2.7, 7.2, 6.9,
           2.9, 7.2, 1.4, 2.9, 2.2, 4.2, 1.5, 6.6, 6.1, 1.5, 4.6, 6.5, 1.4,
           1.3, 0.5, 3.8, 6.3, 6.8, 6.6, 1.8, 2.5, 7.4, 2.6, 6.8, 6.8, 4. ,
           1.7, 7.1, 6.5, 7.9, 1.4, 2.4, 6.6, 6.4, 7.3, 1.9, 1.8, 7.6, 0.9,
           0.8])
    In [102]:
    # Scratch check of the indexing steps used by the classifier further down.
    arr = np.argsort(np.array([1, 5, 3, 4]))
    arr = arr[:3]  # indices of the three smallest values
    # Look the values up one index at a time ...
    test = []
    for a in arr:
        test.append(np.array([1, 5, 3, 4])[a])
    # ... and again in a single indexing operation.
    test_2 = np.array([1, 5, 3, 4])[arr]
    # Count how many of the selected values equal 1.
    test_2.tolist().count(1)
    
    Out[102]:
    1
    In [109]:
    # Index of the largest element in the list.
    np.argmax([1,5,3,4])
    # Disabled probe: running this line uncommented raises the ValueError shown
    # in the traceback below, because '1x' cannot be converted to an integer.
    # np.bincount([1,1,2,3,'1x'])
    
     
    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    <ipython-input-109-feb333e2c58a> in <module>
          1 np.argmax([1,5,3,4])
    ----> 2 np.bincount([1,1,2,3,'1x'])
    
    ValueError: invalid literal for int() with base 10: '1x'
     

    2.核心算法实现

    In [150]:
    # 距离函数定义
    def l1_distance(a, b):
        """Return the L1 (Manhattan) distance between ``b`` and each row of ``a``.

        The absolute element-wise differences ``|a - b|`` are summed along
        axis 1, giving one distance per row.
        """
        abs_diff = np.abs(a - b)
        return abs_diff.sum(axis=1)
    def l2_distance(a, b):
        """Return the L2 (Euclidean) distance between ``b`` and each row of ``a``.

        The squared element-wise differences are summed along axis 1 and the
        square root of each sum is returned, giving one distance per row.
        """
        squared_diff = np.square(a - b)
        return np.sqrt(squared_diff.sum(axis=1))
    
    
    # 分类器实现
    class kNN(object):
        """Brute-force k-nearest-neighbours classifier.

        Labels must be non-negative integers because the majority vote is
        taken with ``np.bincount``.
        """

        def __init__(self, n_neighbors=1, dist_func=l1_distance):
            """Store the hyper-parameters.

            n_neighbors: number of nearest training points that vote.
            dist_func:   callable ``(train_matrix, point) -> distances`` that
                         returns one distance per training row.
            """
            self.n_neighbors = n_neighbors
            self.dist_func = dist_func

        def fit(self, x, y):
            """Memorise the training features ``x`` and labels ``y``."""
            self.x_train = x
            self.y_train = y

        def predict(self, x):
            """Predict a label for every row of ``x``.

            Returns an array of shape ``(x.shape[0], 1)`` with the dtype of
            the training labels.
            """
            # One output slot per query point.
            y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)
            for i, x_test in enumerate(x):
                # Distance from this query point to every training point.
                distances = self.dist_func(self.x_train, x_test)
                # Indices of the k closest training points.
                nn_indexes = np.argsort(distances)[:self.n_neighbors]
                # Use the labels stored by fit(), not a global `y_train`, so
                # the model does not depend on variables outside the instance.
                y_res = self.y_train[nn_indexes].ravel().tolist()
                # Majority vote: the most frequent label wins; on a tie
                # np.argmax picks the smallest label.
                y_pred[i] = np.argmax(np.bincount(y_res))
            return y_pred
    
    In [160]:
    # Build a model with 5 neighbours and the L1 distance, fit it on the
    # training split, then predict labels for the test split.
    kNN_model = kNN(
        n_neighbors=5,
        dist_func=l1_distance,
    )
    kNN_model.fit(x_train, y_train)
    y_pred = kNN_model.predict(x_test)
    
    In [161]:
    # Accuracy of the predictions against the true test labels.
    accuracy_score(y_test,y_pred)
    
    Out[161]:
    0.9777777777777777
    In [166]:
    # Compare settings: both distance functions, each with k = 1, 3, 5, 7, 9.
    knn = kNN()
    knn.fit(x_train, y_train)
    result_list = []
    for p in [1, 2]:
        # p == 1 selects the L1 distance; any other value selects L2.
        if p == 1:
            knn.dist_func = l1_distance
        else:
            knn.dist_func = l2_distance
        # Try every odd neighbour count below 10.
        for k in range(1, 10, 2):
            knn.n_neighbors = k
            y_pred = knn.predict(x_test)
            accuracy = accuracy_score(y_test, y_pred)
            print(accuracy)
            row = [knn.n_neighbors, knn.dist_func.__name__, accuracy]
            result_list.append(row)
    # One row per (k, distance function) combination.
    df = pd.DataFrame(result_list, columns=['k', "距离函数", "准确率"])
    df
    
     
    0.9333333333333333
    0.9333333333333333
    0.9777777777777777
    0.9555555555555556
    0.9555555555555556
    0.9333333333333333
    0.9333333333333333
    0.9777777777777777
    0.9777777777777777
    0.9777777777777777
    
    Out[166]:
     
     k 距离函数 准确率
    0 1 l1_distance 0.933333
    1 3 l1_distance 0.933333
    2 5 l1_distance 0.977778
    3 7 l1_distance 0.955556
    4 9 l1_distance 0.955556
    5 1 l2_distance 0.933333
    6 3 l2_distance 0.933333
    7 5 l2_distance 0.977778
    8 7 l2_distance 0.977778
    9 9 l2_distance 0.977778
    In [ ]:
     
    In [ ]:
     
  • 相关阅读:
    Rock the Tech Interview
    k-d Tree in TripAdvisor
    Randomized QuickSelect
    Kth Smallest Element in Unsorted Array
    Quick Sort
    LRU Cache 解答
    Implement Queue using Stacks 解答
    Implement Stack using Queues 解答
    ListNode Review ReverseListNode
    BackTracking
  • 原文地址:https://www.cnblogs.com/arli/p/11440430.html
Copyright © 2011-2022 走看看