from sklearn.model_selection import GridSearchCV
param_grid = {'C': np.arange(1e-05, 3, 0.1)}
scoring = {'Accuracy': 'accuracy', 'AUC': 'roc_auc', 'Log_loss': 'neg_log_loss'}
gs = GridSearchCV(LogisticRegression(), return_train_score=True,
param_grid=param_grid, scoring=scoring, cv=10, refit='Accuracy')
def train_model(model, param_grid=[], X=[], y=[],
splits=5, repeats=5):
# get unmodified training data, unless data to use already specified
if len(y) == 0:
X, y = get_training_data()
# create cross-validation method
rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)
# perform a grid search if param_grid given
if len(param_grid) > 0:
# setup grid search parameters
gsearch = GridSearchCV(model, param_grid, cv=rkfold,
scoring=rmse_scorer,
verbose=1, return_train_score=True)
# search the grid
gsearch.fit(X, y)
# extract best model from the grid
model = gsearch.best_estimator_
best_idx = gsearch.best_index_
# get cv-scores for best model
grid_results = pd.DataFrame(gsearch.cv_results_)
cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
cv_std = grid_results.loc[best_idx, 'std_test_score']
# no grid search, just cross-val score for given model
else:
grid_results = []
cv_results = cross_val_score(model, X, y, scoring=rmse_scorer, cv=rkfold)
cv_mean = abs(np.mean(cv_results))
cv_std = np.std(cv_results)
# combine mean and std cv-score in to a pandas series
cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})
# predict y using the fitted model
y_pred = model.predict(X)
# print stats on model performance
print('----------------------')
print(model)
print('----------------------')
print('score=', model.score(X, y))
print('rmse=', rmse(y, y_pred))
print('cross_val: mean=', cv_mean, ', std=', cv_std)
# residual plots
y_pred = pd.Series(y_pred, index=y.index)
resid = y - y_pred
mean_resid = resid.mean()
std_resid = resid.std()
z = (resid - mean_resid) / std_resid
n_outliers = sum(abs(z) > 3)
plt.figure(figsize=(15, 5))
ax_131 = plt.subplot(1, 3, 1)
plt.plot(y, y_pred, '.')
plt.xlabel('y')
plt.ylabel('y_pred');
plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))
ax_132 = plt.subplot(1, 3, 2)
plt.plot(y, y - y_pred, '.')
plt.xlabel('y')
plt.ylabel('y - y_pred');
plt.title('std resid = {:.3f}'.format(std_resid))
ax_133 = plt.subplot(1, 3, 3)
z.plot.hist(bins=50, ax=ax_133)
plt.xlabel('z')
plt.title('{:.0f} samples with z>3'.format(n_outliers))
return model, cv_score, grid_results
def find_outliers(model, X, y, sigma=3):
# predict y values using model
try:
y_pred = pd.Series(model.predict(X), index=y.index)
# if predicting fails, try fitting the model first
except:
model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=y.index)
# calculate residuals between the model prediction and true y values
resid = y - y_pred
mean_resid = resid.mean()
std_resid = resid.std()
# calculate z statistic, define outliers to be where |z|>sigma
z = (resid - mean_resid) / std_resid
outliers = z[abs(z) > sigma].index
# print and plot the results
print('R2=', model.score(X, y))
print('rmse=', rmse(y, y_pred))
print('---------------------------------------')
print('mean of residuals:', mean_resid)
print('std of residuals:', std_resid)
print('---------------------------------------')
print(len(outliers), 'outliers:')
print(outliers.tolist())
plt.figure(figsize=(15, 5))
ax_131 = plt.subplot(1, 3, 1)
plt.plot(y, y_pred, '.')
plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
plt.legend(['Accepted', 'Outlier'])
plt.xlabel('y')
plt.ylabel('y_pred');
ax_132 = plt.subplot(1, 3, 2)
plt.plot(y, y - y_pred, '.')
plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
plt.legend(['Accepted', 'Outlier'])
plt.xlabel('y')
plt.ylabel('y - y_pred');
ax_133 = plt.subplot(1, 3, 3)
z.plot.hist(bins=50, ax=ax_133)
z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
plt.legend(['Accepted', 'Outlier'])
plt.xlabel('z')
# plt.savefig('outliers.png')
return outliers
def rmsle_cv(model):
kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
return(rmse)
def rmse(y_true, y_pred):
diff = y_pred - y_true
sum_sq = sum(diff ** 2)
n = len(y_pred)
return np.sqrt(sum_sq / n)
# scorer to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
pca = decomposition.PCA().fit(X)
plt.figure(figsize=(10,7))
plt.plot(np.cumsum(pca.explained_variance_ratio_), color='k', lw=2)
plt.xlabel('Number of components')
plt.ylabel('Total explained variance')
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
original_columns = list(df.columns)
categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
new_columns = [c for c in df.columns if c not in original_columns]
return df, new_columns
bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
for col in bb_cat:
bb_aggregations[col] = ['mean']
bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])