简单:
一、手动写一个KNN算法解决分类问题
from collections import Counter  # used for the majority vote

import numpy as np


def euc_dis(instance1, instance2):
    """Return the Euclidean distance between two samples.

    Args:
        instance1: First sample, a 1-D array-like of numbers.
        instance2: Second sample, same length as ``instance1``.

    Returns:
        The L2 distance between the two samples as a NumPy float.
    """
    # Cast to float so list inputs work and integer dtypes cannot wrap around
    # when subtracted.
    diff = np.asarray(instance1, dtype=float) - np.asarray(instance2, dtype=float)
    # np.sum reduces in native code; the builtin sum would iterate the array
    # element by element in Python.
    return np.sqrt(np.sum(diff ** 2))


def knn_classify(X, y, testInstance, k):
    """Predict the label of a single test sample with k-nearest neighbours.

    Args:
        X: Training features, one row per training sample.
        y: Training labels, aligned with the rows of ``X``.
        testInstance: The single sample to classify (1-D array-like).
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The most frequent label among the ``k`` nearest training samples.
        When several labels share the highest count, the label that appears
        first among the neighbours ordered by increasing distance is returned.

    Raises:
        ValueError: If ``X`` is empty or ``k`` is smaller than 1.
    """
    X = np.asarray(X)
    y = np.asarray(y)  # fancy indexing below needs an array, not a list
    if len(X) == 0:
        raise ValueError("X must contain at least one training sample")
    if k < 1:
        # k == 0 would leave no voters; a negative k would silently slice
        # "all but the last -k" neighbours instead of failing.
        raise ValueError("k must be a positive integer, got %r" % (k,))

    distances = [euc_dis(x, testInstance) for x in X]
    nearest = np.argsort(distances)[:k]
    votes = Counter(y[nearest])
    return votes.most_common(1)[0][0]


if __name__ == "__main__":
    # scikit-learn is only needed for the demo below, so it is imported here;
    # the two helpers above stay importable with NumPy alone.
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    # Load the iris data and hold out a test split (fixed seed).
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2003)

    # Classify every held-out sample with k=3 and report the accuracy.
    predictions = [knn_classify(X_train, y_train, data, 3) for data in X_test]
    correct = np.count_nonzero(np.asarray(predictions) == y_test)
    print("Accuracy is: %.3f" % (correct / len(X_test)))
二、K折交叉验证选择合适的K值
import numpy as np
from sklearn import datasets
from sklearn.model_selection import KFold  # index splitter for k-fold cross-validation
from sklearn.neighbors import KNeighborsClassifier

# Load the iris data set.
iris = datasets.load_iris()
X = iris.data
Y = iris.target
print(X.shape, Y.shape)

# Candidate values of K (number of neighbours) to choose from.
ks = [1, 3, 5, 7, 9, 11, 13, 15]

# 5-fold cross-validation. KFold.split yields, for every fold, the indices of
# the training samples followed by the indices of the validation samples.
# With 10 samples the five folds could look like this:
#   [0, 1, 3, 5, 6, 7, 8, 9], [2, 4]
#   [0, 1, 2, 4, 6, 7, 8, 9], [3, 5]
#   [1, 2, 3, 4, 5, 6, 7, 8], [0, 9]
#   [0, 1, 2, 3, 4, 5, 7, 9], [6, 8]
#   [0, 2, 3, 4, 5, 6, 8, 9], [1, 7]
n_splits = 5
kf = KFold(n_splits=n_splits, random_state=2001, shuffle=True)

# Best K found so far and its mean validation accuracy.
best_k = ks[0]
best_score = 0

for k in ks:
    # Train and score one classifier per fold.
    fold_scores = []
    for train_index, valid_index in kf.split(X):
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X[train_index], Y[train_index])
        fold_scores.append(clf.score(X[valid_index], Y[valid_index]))

    # Divide by the number of folds actually scored instead of a literal 5, so
    # the mean stays correct if n_splits is changed.
    avg_score = sum(fold_scores) / len(fold_scores)
    # Strict ">" keeps the smaller K when two candidates tie.
    if avg_score > best_score:
        best_k = k
        best_score = avg_score
    print("current best score is: %.2f" % best_score, "best k: %d" % best_k)

print("after cross validation, the final best k is: %d" % best_k)
使用sklearn方法来实现:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV  # grid search over parameter values
from sklearn.neighbors import KNeighborsClassifier

# Load the iris data set.
iris = datasets.load_iris()
X = iris.data
Y = iris.target

# K values to search; "n_neighbors" is the name of the KNN parameter in sklearn.
parameters = {"n_neighbors": [1, 3, 5, 7, 9, 11, 13, 15]}
knn = KNeighborsClassifier()

# GridSearchCV evaluates every candidate K with 5-fold cross-validation (cv=5),
# i.e. it automates a manual loop over the candidate K values and the folds.
clf = GridSearchCV(knn, parameters, cv=5)
clf.fit(X, Y)

# Report the best mean cross-validation accuracy and the parameters behind it.
print("best score is:%.2f" % clf.best_score_, " best param: ", clf.best_params_)