#coding=utf8
"""Model-selection experiments for house-price regression.

The script reads ``./input/train.csv`` and ``./input/test.csv``, one-hot
encodes the categorical columns, fills missing values with column means,
standardizes the originally-numeric columns, and then plots the
cross-validated RMSE (computed on ``log1p(SalePrice)``) of several
regressors over a small grid of one hyper-parameter each.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor


def _cv_rmse(model, features, target, folds):
    """Return the mean over CV folds of the per-fold RMSE of *model*."""
    neg_mse = cross_val_score(model, features, target, cv=folds,
                              scoring='neg_mean_squared_error')
    # The scorer yields negated MSE, so flip the sign before the square root.
    return np.mean(np.sqrt(-neg_mse))


def _plot_curve(xs, ys, title):
    """Draw one hyper-parameter-vs-error curve on a figure of its own."""
    # A fresh figure keeps successive curves (and titles) from piling up
    # on the same axes.
    plt.figure()
    plt.plot(xs, ys)
    plt.title(title)


# The first column (the id) is not a feature; it is only used as the index.
train_df = pd.read_csv('./input/train.csv', index_col=0)
test_df = pd.read_csv('./input/test.csv', index_col=0)

# Raw and log1p-transformed prices side by side; call prices.hist() to
# compare the two distributions.
prices = pd.DataFrame({'price': train_df['SalePrice'],
                       'log(price + 1)': np.log1p(train_df['SalePrice'])})

# pop() removes the target column from train_df, so train_df and test_df can
# be concatenated below. The models are trained on log1p(SalePrice).
y_train = np.log1p(train_df.pop('SalePrice'))

all_df = pd.concat((train_df, test_df), axis=0)

# Variable conversion.
print(train_df.index)
print(test_df.index)

# Cast MSSubClass to str so it is handled as a category rather than a number.
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)

# When numbers are used to express a categorical variable, their magnitude
# carries meaning, so arbitrary numeric codes cause trouble for the model
# later on. One-hot encoding avoids that; pandas' get_dummies does it in one
# call, applied here to all the categorical data.
all_dummy_df = pd.get_dummies(all_df)

# Handle missing values: fill each column's gaps with that column's mean.
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)

# Standardize the numerical data. The one-hot 0/1 columns do not need it;
# the targets are the columns that were numerical to begin with, i.e. the
# non-object columns of all_df.
numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_dummy_df.loc[:, numeric_cols].mean()
numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()
all_dummy_df.loc[:, numeric_cols] = (all_dummy_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std

# Split the combined frame back into its train and test rows by index label.
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]

X_train = dummy_train_df.values
X_test = dummy_test_df.values
print(X_train.shape)

# Ridge regression: regularization strength vs CV error.
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
    clf = Ridge(alpha=alpha)
    test_scores.append(_cv_rmse(clf, X_train, y_train, 10))
_plot_curve(alphas, test_scores, 'Alpha vs CV Error')

# Random forest: fraction of features considered per split vs CV error.
max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
    test_scores.append(_cv_rmse(clf, X_train, y_train, 5))
_plot_curve(max_features, test_scores, "Max Features vs CV Error")

# A slightly more advanced ensemble.
# Original author's note: the base estimator could be left out to use the
# built-in default, but the result is worse than with the already-tuned base
# estimator, which can be checked by plotting.
ridge = Ridge(alpha=15)

# Bagging
# The base estimator is passed positionally because scikit-learn renamed the
# keyword from `base_estimator` to `estimator`.
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:
    clf = BaggingRegressor(ridge, n_estimators=param)
    test_scores.append(_cv_rmse(clf, X_train, y_train, 10))
_plot_curve(params, test_scores, "n_estimator vs CV Error")

# Boosting
params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
test_scores = []
for param in params:
    clf = AdaBoostRegressor(ridge, n_estimators=param)
    test_scores.append(_cv_rmse(clf, X_train, y_train, 10))
_plot_curve(params, test_scores, "n_estimator vs CV Error")

# XGBoost: tree depth vs CV error.
params = [1, 2, 3, 4, 5, 6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth=param)
    test_scores.append(_cv_rmse(clf, X_train, y_train, 10))
_plot_curve(params, test_scores, "max_depth vs CV Error")

# Disabled final-prediction step, kept as an inert string literal: it would
# average the Ridge and random-forest predictions after undoing the log1p
# transform with expm1.
"""
rf = RandomForestRegressor(n_estimators=500, max_features=.3)
ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)
y_ridge = np.expm1(ridge.predict(X_test))
y_rf = np.expm1(rf.predict(X_test))
y_final = (y_ridge + y_rf) / 2
"""

# Display every figure created above.
plt.show()