我们怎么评价得到的K近邻算法的性能?
K近邻算法训练后直接投入真实环境中使用可能会造成真实损失,我们需要判断该算法的准确度以及性能,怎么测试?
解决方法:将原始数据分割成两部分,一部分作为训练数据,另一部分作为测试数据;只用训练数据训练模型,再用测试数据检验预测的准确率,即train_test_spilt(训练/测试数据集划分)
代码如下:
import math
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn import datasets
# Configure matplotlib once for the whole script.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],   # font able to render the Chinese labels
    'axes.unicode_minus': False,     # draw the minus sign correctly with this font
})
class KNNClassfiy(object):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only stores the training data; all the work happens in
    ``predict``, which labels each sample with the majority class among
    its ``k`` closest training samples.
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` nearest neighbours.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, 'k must be valid'
        self.k = k
        # Populated by fit(); None means "not fitted yet".
        self._xTrain = None
        self._yTrain = None

    def fit(self, xTrain, yTrain):
        """Store the training samples and their labels.

        Args:
            xTrain: array with one training sample per row.
            yTrain: array holding one label per row of ``xTrain``.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: if the row counts differ or there are fewer
                than ``k`` training samples.
        """
        assert xTrain.shape[0] == yTrain.shape[0], (
            'The size of xTrain must be equals to the size of yTrain')
        assert 1 <= self.k <= xTrain.shape[0], (
            'The size of xTrain must be least at k')
        self._xTrain = xTrain
        self._yTrain = yTrain
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: 2-D array with one sample per row and the same
                number of columns as the training data.

        Returns:
            A numpy array with one predicted label per input row.

        Raises:
            AssertionError: if called before ``fit`` or if the number of
                features does not match the training data.
        """
        # Check the fitted state first: the feature-count check below
        # dereferences self._xTrain and would otherwise fail with an
        # AttributeError on an unfitted model.
        assert self._xTrain is not None and self._yTrain is not None, (
            'must fit before predict')
        assert X_predict.shape[1] == self._xTrain.shape[1], (
            'The feature of x must be equal to xTrain')
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the majority label among the ``k`` neighbours of ``x``."""
        # Euclidean distance from x to every training row in one
        # vectorised expression.
        distances = np.sqrt(np.sum((self._xTrain - x) ** 2, axis=1))
        nearest = np.argsort(distances)
        top_y = [self._yTrain[i] for i in nearest[:self.k]]
        votes = Counter(top_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        """Return a string such as ``KNNClassfiy(k=6)``."""
        # __repr__ must return a str; returning the int k raises TypeError.
        return '%s(k=%r)' % (type(self).__name__, self.k)
def train_test_spilt(x, y, rate=0.2, seed=None):
    """Randomly split samples and labels into a test part and a train part.

    Args:
        x: array with one sample per row.
        y: array with one label per row of ``x``.
        rate: fraction of the rows (between 0 and 1) put in the test part.
        seed: optional seed for the shuffle. With the same seed the split
            is reproducible; with ``None`` numpy's global random state is
            used, so each call may give a different split.

    Returns:
        A tuple ``(x_test, x_train, y_test, y_train)`` -- note that the
        test part comes first.

    Raises:
        AssertionError: if ``x`` and ``y`` have different row counts or
            ``rate`` is outside ``[0, 1]``.
    """
    assert x.shape[0] == y.shape[0], 'x must be same as y in shape[0]'
    # A rate outside [0, 1] would produce a meaningless slice below.
    assert 0 <= rate <= 1, 'rate must be between 0 and 1'
    sample_count = len(x)
    # Number of rows that go into the test part.
    test_size = int(sample_count * rate)
    if seed is None:
        shuff_index = np.random.permutation(sample_count)
    else:
        # A private generator keeps the global random state untouched.
        shuff_index = np.random.RandomState(seed).permutation(sample_count)
    # The leading shuffled indices select the test rows ...
    test_index = shuff_index[:test_size]
    # ... and the remaining ones select the training rows.
    train_index = shuff_index[test_size:]
    return x[test_index], x[train_index], y[test_index], y[train_index]
# Load the iris data set and hold out 20% of it for testing.
iris = datasets.load_iris()
x, y = iris.data, iris.target
x_test, x_train, y_test, y_train = train_test_spilt(x, y, 0.2)

# Fit the classifier on the training part only.
xTrain = x_train
yTrain = y_train
KNN_clf = KNNClassfiy(k=6)
KNN_clf.fit(xTrain=xTrain, yTrain=yTrain)

# Predict the held-out samples.
x_predict = np.array(x_test)
res = KNN_clf.predict(x_predict)

# Plot the first two features of the training and the test samples.
for points, label, color in ((xTrain, '训练数据', 'r'),
                             (x_predict, '测试数据', 'b')):
    plt.scatter(points[:, 0], points[:, 1], label=label, c=color, s=75)
plt.legend()

# Accuracy: share of test samples whose prediction matches the true label.
percent = sum(res == y_test) / len(x_test)
print(percent)
由于没有设置随机数种子,每次运行代码时训练数据和测试数据的划分都不相同,因此得到的percent(预测准确率)不一定相等;可以在打乱下标之前设置随机数种子(例如调用np.random.seed)来保证结果可以复现。
绘制的散点图