"""Boston housing price analysis.

Loads ``housing.csv``, prints summary statistics of the ``MEDV`` column,
tunes the ``max_depth`` of a decision tree regressor with a cross-validated
grid search, and prints predicted selling prices for three sample clients.
"""

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    ShuffleSplit,  # noqa: F401  (kept: referenced by the commented-out cv_sets line)
    train_test_split,
)
from sklearn.tree import DecisionTreeRegressor

# Supplementary plotting code (only used by the commented-out call below).
from Udacity.model_check.boston_house_price import visuals as vs  # noqa: F401

# Load the Boston housing dataset. 'MEDV' is the prediction target; every
# other column is treated as an input feature.
data = pd.read_csv('housing.csv')
prices = data['MEDV']
features = data.drop('MEDV', axis=1)
# print(data.describe())

# Success
print("Boston housing dataset has {} data points with {} variables each.".format(*data.shape))

# Goal: minimum of the prices
minimum_price = np.min(data['MEDV'])

# Goal: maximum of the prices
maximum_price = np.max(data['MEDV'])

# Goal: mean of the prices
mean_price = np.mean(data['MEDV'])

# Goal: median of the prices
median_price = np.median(data['MEDV'])

# Goal: standard deviation of the prices
std_price = np.std(data['MEDV'])

# Goal: print the computed statistics
print("Statistics for Boston housing dataset: ")
print("Minimum price: ${:,.2f}".format(minimum_price))
print("Maximum price: ${:,.2f}".format(maximum_price))
print("Mean price: ${:,.2f}".format(mean_price))
print("Median price ${:,.2f}".format(median_price))
print("Standard deviation of prices: ${:,.2f}".format(std_price))

# Columns: RM, LSTAT, PTRATIO, MEDV
# Preliminary analysis:
#   1. The larger RM is, the larger MEDV is.
#   2. The larger LSTAT is, the smaller MEDV is.
#   3. The larger PTRATIO is, the smaller MEDV is.


def performance_metric(y_true, y_predict):
    """Return the R^2 score between true and predicted values.

    Args:
        y_true: Ground-truth target values.
        y_predict: Predicted target values, aligned with ``y_true``.

    Returns:
        The coefficient of determination computed by
        ``sklearn.metrics.r2_score``.
    """
    return r2_score(y_true, y_predict)


# score = performance_metric([3, -0.5, 2, 7, 4.2], [2.5, 0.0, 2.1, 7.8, 5.3])
# print ("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

# Hold out 20% of the data for testing and train on the remaining 80%.
# The previous value (test_size=0.80) trained the model on only 20% of the
# rows, which looks like an inverted train/test proportion.
X_train, X_test, y_train, y_test = train_test_split(
    features, prices, test_size=0.20, random_state=1
)

# Success
print("Training and testing split was successful.")

# vs.ModelLearning(features, prices)


def fit_model(X, y):
    """Grid-search ``max_depth`` for a decision tree regressor on ``[X, y]``.

    Candidate depths 1 through 10 are compared with 10-fold cross-validation,
    scored by ``performance_metric`` (R^2).

    Args:
        X: Training features.
        y: Training target values.

    Returns:
        The best estimator found by the grid search (``best_estimator_``).
    """
    # Create cross-validation sets from the training data.
    cross_validator = KFold(n_splits=10)
    # cv_sets = ShuffleSplit(X.shape[0], test_size=0.20, random_state=0)

    regressor = DecisionTreeRegressor()

    # Candidate values for 'max_depth': 1 through 10 inclusive.
    params = {"max_depth": list(range(1, 11))}

    # Wrap 'performance_metric' so the grid search can use it as a scorer.
    scoring_fnc = make_scorer(performance_metric)

    # 'scoring' must be passed by keyword: every GridSearchCV argument after
    # 'param_grid' is keyword-only in current scikit-learn releases.
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cross_validator)

    # Fit the grid search object to the data to compute the optimal model.
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data.
    return grid.best_estimator_


reg = fit_model(X_train, y_train)

# Produce the value for 'max_depth'
print("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))

# NOTE(review): presumably each row is [RM, LSTAT, PTRATIO], matching the
# feature columns of housing.csv -- confirm against the dataset.
client_data = [[5, 17, 15],  # Client 1
               [4, 32, 22],  # Client 2
               [8, 3, 12]]   # Client 3

# Show predictions
for i, price in enumerate(reg.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i + 1, price))