zoukankan      html  css  js  c++  java
  • sklearn笔记

    from sklearn.model_selection import GridSearchCV

    # Candidate values for LogisticRegression's C: 1e-05, 0.10001, ... stepping
    # by 0.1 and stopping before 3.
    param_grid = dict(C=np.arange(1e-05, 3, 0.1))

    # Every candidate is evaluated with all three metrics; the dict keys are the
    # names under which each metric is reported in the search results.
    scoring = dict(Accuracy='accuracy', AUC='roc_auc', Log_loss='neg_log_loss')

    # 10-fold grid search. With several metrics, refit must name the one used to
    # pick the final estimator -- here 'Accuracy'.
    gs = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=param_grid,
        scoring=scoring,
        cv=10,
        refit='Accuracy',
        return_train_score=True,
    )
    def train_model(model, param_grid=[], X=[], y=[],
                    splits=5, repeats=5):
        """Tune and/or cross-validate ``model``, then print and plot its fit.

        Args:
            model: estimator exposing ``fit``, ``predict`` and ``score``.
            param_grid: parameter grid handed to ``GridSearchCV``. When empty,
                no search is run and ``model`` is only cross-validated.
            X: feature matrix. Replaced by ``get_training_data()`` when ``y``
                is empty.
            y: target values; must expose ``.index`` (the residual plots build
                a ``pd.Series`` aligned on it).
            splits: number of folds for ``RepeatedKFold``.
            repeats: number of repetitions for ``RepeatedKFold``.

        Returns:
            Tuple ``(model, cv_score, grid_results)``: the estimator used for
            the predictions (the grid search's best estimator when a search was
            run), a ``pd.Series`` with the ``'mean'`` and ``'std'`` of the
            cross-validation score, and the ``cv_results_`` DataFrame (an empty
            list when no search was run).
        """
        # Imported locally so the block does not rely on a file-level import.
        from sklearn.exceptions import NotFittedError

        # The list defaults act as "not provided" markers; they are only read,
        # never mutated, so sharing them between calls is harmless.
        # get unmodified training data, unless data to use already specified
        if len(y) == 0:
            X, y = get_training_data()

        # create cross-validation method
        rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

        # perform a grid search if param_grid given
        if len(param_grid) > 0:
            # setup grid search parameters
            gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                                   scoring=rmse_scorer,
                                   verbose=1, return_train_score=True)

            # search the grid
            gsearch.fit(X, y)

            # extract best model from the grid (already refit on all of X, y)
            model = gsearch.best_estimator_
            best_idx = gsearch.best_index_

            # get cv-scores for best model; abs() because rmse_scorer is built
            # with greater_is_better=False and therefore reports negated values
            grid_results = pd.DataFrame(gsearch.cv_results_)
            cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
            cv_std = grid_results.loc[best_idx, 'std_test_score']

        # no grid search, just cross-val score for given model
        else:
            grid_results = []
            cv_results = cross_val_score(model, X, y, scoring=rmse_scorer, cv=rkfold)
            cv_mean = abs(np.mean(cv_results))
            cv_std = np.std(cv_results)

        # combine mean and std cv-score in to a pandas series
        cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

        # predict y using the fitted model. cross_val_score only fits clones,
        # so without a grid search an unfitted estimator would still be
        # unfitted here: fit it on the full data instead of crashing.
        try:
            y_pred = model.predict(X)
        except NotFittedError:
            model.fit(X, y)
            y_pred = model.predict(X)

        # print stats on model performance
        print('----------------------')
        print(model)
        print('----------------------')
        print('score=', model.score(X, y))
        print('rmse=', rmse(y, y_pred))
        print('cross_val: mean=', cv_mean, ', std=', cv_std)

        # residual statistics: z is the standardised residual of each sample
        y_pred = pd.Series(y_pred, index=y.index)
        resid = y - y_pred
        mean_resid = resid.mean()
        std_resid = resid.std()
        z = (resid - mean_resid) / std_resid
        n_outliers = int((z.abs() > 3).sum())

        # residual plots: prediction vs truth, residual vs truth, z histogram
        plt.figure(figsize=(15, 5))
        plt.subplot(1, 3, 1)
        plt.plot(y, y_pred, '.')
        plt.xlabel('y')
        plt.ylabel('y_pred')
        plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0, 1]))

        plt.subplot(1, 3, 2)
        plt.plot(y, y - y_pred, '.')
        plt.xlabel('y')
        plt.ylabel('y - y_pred')
        plt.title('std resid = {:.3f}'.format(std_resid))

        ax_133 = plt.subplot(1, 3, 3)
        z.plot.hist(bins=50, ax=ax_133)
        plt.xlabel('z')
        plt.title('{:.0f} samples with z>3'.format(n_outliers))

        return model, cv_score, grid_results
    def find_outliers(model, X, y, sigma=3):
        """Flag samples whose standardised residual exceeds ``sigma``.

        Args:
            model: estimator exposing ``fit``, ``predict`` and ``score``. If
                predicting fails it is fitted on ``(X, y)`` first.
            X: feature matrix passed to the model.
            y: target values; must expose ``.index`` and ``.loc`` (used to
                align predictions and to select the outliers).
            sigma: threshold on ``|z|`` above which a sample is an outlier.

        Returns:
            The index labels of ``y`` for which ``|z| > sigma``.
        """
        # predict y values using model
        try:
            y_pred = pd.Series(model.predict(X), index=y.index)
        # if predicting fails, try fitting the model first. Exception (rather
        # than a bare except) keeps this best-effort fallback but no longer
        # swallows KeyboardInterrupt / SystemExit.
        except Exception:
            model.fit(X, y)
            y_pred = pd.Series(model.predict(X), index=y.index)

        # calculate residuals between the model prediction and true y values
        resid = y - y_pred
        mean_resid = resid.mean()
        std_resid = resid.std()

        # calculate z statistic, define outliers to be where |z|>sigma
        z = (resid - mean_resid) / std_resid
        outliers = z[z.abs() > sigma].index

        # print and plot the results
        print('R2=', model.score(X, y))
        print('rmse=', rmse(y, y_pred))
        print('---------------------------------------')

        print('mean of residuals:', mean_resid)
        print('std of residuals:', std_resid)
        print('---------------------------------------')

        print(len(outliers), 'outliers:')
        print(outliers.tolist())

        plt.figure(figsize=(15, 5))

        # prediction vs truth, outliers drawn on top in red
        plt.subplot(1, 3, 1)
        plt.plot(y, y_pred, '.')
        plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
        plt.legend(['Accepted', 'Outlier'])
        plt.xlabel('y')
        plt.ylabel('y_pred')

        # residual vs truth, outliers drawn on top in red
        plt.subplot(1, 3, 2)
        plt.plot(y, y - y_pred, '.')
        plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
        plt.legend(['Accepted', 'Outlier'])
        plt.xlabel('y')
        plt.ylabel('y - y_pred')

        # histogram of z for all samples, with the outliers overlaid in red
        ax_133 = plt.subplot(1, 3, 3)
        z.plot.hist(bins=50, ax=ax_133)
        z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
        plt.legend(['Accepted', 'Outlier'])
        plt.xlabel('z')

        # plt.savefig('outliers.png')

        return outliers
    def rmsle_cv(model):
        """Return the per-fold cross-validated RMSE of ``model``.

        Uses the module-level ``train``, ``y_train`` and ``n_folds``. The score
        is the square root of the negated ``neg_mean_squared_error``.
        NOTE(review): the "rmsle" name suggests ``y_train`` is log-transformed
        by the caller -- confirm.

        Args:
            model: estimator to evaluate.

        Returns:
            Array with one RMSE value per fold.
        """
        # Pass the splitter itself. Calling .get_n_splits() here would return a
        # plain int, so cross_val_score would build its own unshuffled folds and
        # shuffle=True / random_state=42 would be silently ignored.
        kf = KFold(n_folds, shuffle=True, random_state=42)
        neg_mse = cross_val_score(model, train.values, y_train,
                                  scoring="neg_mean_squared_error", cv=kf)
        return np.sqrt(-neg_mse)
    def rmse(y_true, y_pred):
        """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

        Both arguments are converted to float arrays, so lists, NumPy arrays
        and pandas Series are all accepted; values are compared by position.

        Args:
            y_true: observed target values.
            y_pred: predicted values, same length as ``y_true``.

        Returns:
            ``sqrt(mean((y_pred - y_true) ** 2))`` as a NumPy float.
        """
        diff = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
        return np.sqrt(np.mean(diff ** 2))


    # Scorer wrapping rmse for use in sklearn model fitting (GridSearchCV,
    # cross_val_score). greater_is_better=False declares rmse a loss, so the
    # scorer reports its sign-flipped value.
    rmse_scorer = make_scorer(score_func=rmse, greater_is_better=False)
    # Fit a PCA with default settings on X and plot how much variance the
    # leading components explain together.
    pca = decomposition.PCA().fit(X)

    plt.figure(figsize=(10, 7))
    # Plot against 1..n so the x value really is the number of components kept;
    # the implicit 0-based index would shift the curve one component left.
    plt.plot(np.arange(1, len(pca.explained_variance_ratio_) + 1),
             np.cumsum(pca.explained_variance_ratio_), color='k', lw=2)
    plt.xlabel('Number of components')
    plt.ylabel('Total explained variance')
    # One-hot encoding for categorical columns with get_dummies
    def one_hot_encoder(df, nan_as_category=True):
        """One-hot encode every object-dtype column of ``df``.

        Args:
            df: input DataFrame.
            nan_as_category: forwarded to ``pd.get_dummies`` as ``dummy_na``;
                when true, each encoded column also gets a NaN indicator.

        Returns:
            Tuple ``(encoded_df, new_columns)`` where ``new_columns`` lists, in
            frame order, the column names that were not in the input.
        """
        # A set makes the "is this column new?" test O(1) per column instead of
        # scanning a list for every output column.
        original_columns = set(df.columns)
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
        df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
        new_columns = [c for c in df.columns if c not in original_columns]
        return df, new_columns
    # Aggregate bb per SK_ID_BUREAU: min/max/size of MONTHS_BALANCE plus the
    # mean of every column listed in bb_cat.
    # NOTE(review): bb_cat is presumably the list of one-hot indicator columns
    # of bb, so each mean is a category frequency -- confirm against the caller.
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    # agg() with a dict of lists yields (column, statistic) pairs as column
    # labels; flatten them to names such as MONTHS_BALANCE_MIN.
    bb_agg.columns = pd.Index([name + "_" + stat.upper() for name, stat in bb_agg.columns.tolist()])
  • 相关阅读:
    教你用photoshop cs5或者cs6做IPad,背景随意换,gif制作,高清教程,原创
    ASP.NET MVC4 IN ACTION学习笔记第一波
    潜移默化学会C#不常用语法《1》动态类型绑定dynamic
    SubSnoic 框架入门到提高(1)全程记录
    杨洋疯狂C# 刊号:201208 第1期ASPNET验证(一)
    杨洋疯狂C# 刊号:201207 第1期
    ASP.NET MVC4 IN ACTION学习笔记第二波
    JavaScript深入【表达式和运算符(上集)】你能过我8关js运算符的题目吗?
    清新空气我的.net(C#)生涯知识总结 跨CSS,JS,JAVA,AJAX,WPF,WCF,LINQ,ASP.NET,Winform,Sqlserver,Mysql,EF,OOP,开发工具等
    潜移默化学会WPF(Treeview异步加载节点)
  • 原文地址:https://www.cnblogs.com/figo-studypath/p/10008285.html
Copyright © 2011-2022 走看看