# 怎样计算两条数据之间的距离(相近程度)?
# 假设有两条数据 A 和 B,它们各有两个属性 p1 和 p2,
# 那么 A 与 B 的距离(欧氏距离)为:distance = sqrt((A.p1 - B.p1)^2 + (A.p2 - B.p2)^2)
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Script: predict a passenger's Fare from Age and Pclass with a
# k-nearest-neighbours regressor, then report the RMSE on a held-out split.

# Feature columns compared between samples, and the column being predicted.
cols = ['Age', 'Pclass']
target_col = 'Fare'

df = pd.read_csv("./titanic_train.csv")

# Drop only the rows missing a value we actually use. A bare dropna() would
# also discard rows whose only NaN is in a column the model never reads.
df = df.dropna(subset=cols + [target_col])

# First 75% of the rows are used for training, the remainder for testing.
trainCount = int(df.shape[0] * 0.75)
df_train = df.iloc[:trainCount]
df_test = df.iloc[trainCount:]

# KNN ranks neighbours by Euclidean distance, so a feature with a wide numeric
# range would swamp one with a narrow range. Standardise the features, fitting
# the scaler on the training rows only so the test rows stay unseen.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(df_train[cols])
X_test = scaler.transform(df_test[cols])

# Predict with the mean of the 10 most similar neighbours (default is 5).
knn = KNeighborsRegressor(n_neighbors=10)

# Train: the target is the Fare value.
knn.fit(X_train, df_train[target_col])

# Predict Fare for the test split and show it next to the true values.
predictions = knn.predict(X_test)
print(predictions)
print(df_test[target_col].values)

# Evaluate on the test split with the root-mean-squared error.
mse = mean_squared_error(df_test[target_col], predictions)
# 0.5 rather than (1/2): the latter is 0 under integer division.
rmse = mse ** 0.5
print(rmse)