  • Feature Selection Can Reduce Overfitting, and Random Forests Can Show Feature Importance

    1. Feature selection can reduce overfitting: a code example

     This example comes from Chapter 4 of Machine Learning in Action. Sequential backward selection (SBS) starts from the full feature set and greedily removes, at each step, the feature whose removal costs the least validation accuracy, until only k_features features remain.

    # coding=utf-8

    '''
    We use KNN to show that feature selection may reduce overfitting.
    '''

    from sklearn.base import clone
    from itertools import combinations
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    class SBS():
        def __init__(self, estimator, k_features, scoring=accuracy_score, test_size=0.25, random_state=1):
            self.scoring = scoring
            self.estimator = clone(estimator)
            self.k_features = k_features
            self.test_size = test_size
            self.random_state = random_state

        def fit(self, X, y):
            # Hold out an internal validation split for scoring candidate feature subsets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)
            dim = X_train.shape[1]

            self.indices_ = tuple(range(dim))
            self.subsets_ = [self.indices_]
            score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)

            self.scores_ = [score]

            while dim > self.k_features:
                scores = []
                subsets = []

                # Score every subset obtained by dropping exactly one feature
                for p in combinations(self.indices_, r=dim - 1):
                    score = self._calc_score(X_train, y_train, X_test, y_test, p)
                    scores.append(score)
                    subsets.append(p)

                # Keep the best (dim-1)-feature subset and shrink by one dimension
                best = np.argmax(scores)
                self.indices_ = subsets[best]
                self.subsets_.append(self.indices_)
                dim -= 1

                self.scores_.append(scores[best])

            self.k_score_ = self.scores_[-1]

            return self

        def transform(self, X):
            # Project X onto the finally selected feature subset
            return X[:, self.indices_]

        def _calc_score(self, X_train, y_train, X_test, y_test, indices):
            self.estimator.fit(X_train[:, indices], y_train)
            y_pred = self.estimator.predict(X_test[:, indices])
            score = self.scoring(y_test, y_pred)

            return score
    
    import pandas as pd
    
    df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
    df_wine.columns = ['Class label', 'Alcohol',
                       'Malic acid',
                       'Ash',
                       'Alcalinity of ash',
                       'Magnesium',
                       'Total phenols',
                       'Flavanoids',
                       'Nonflavanoid phenols',
                       'Proanthocyanins',
                       'Color intensity',
                       'Hue',
                       'OD280/OD315 of diluted wines',
                       'Proline']
    
    X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    
    from sklearn.preprocessing import StandardScaler
    stdsc = StandardScaler()
    X_train_std = stdsc.fit_transform(X_train)
    X_test_std = stdsc.transform(X_test)
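
    # KNN is distance-based, so features must be brought to comparable scales;
    # fitting the scaler on the training split only and reusing it on the test
    # split avoids leaking test-set statistics into preprocessing.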
    
    
    from sklearn.neighbors import KNeighborsClassifier
    import matplotlib.pyplot as plt
    knn = KNeighborsClassifier(n_neighbors=2)
    sbs = SBS(knn, k_features=1)
    sbs.fit(X_train_std, y_train)
    
    k_feat = [len(k) for k in sbs.subsets_]
    
    plt.figure(figsize=(8, 10))  # figsize must be a tuple
    plt.subplot(2,1,1)
    plt.plot(k_feat, sbs.scores_, marker='o')
    plt.ylim([0.7, 1.1])
    plt.ylabel('Accuracy')
    plt.xlabel('Number of features')
    plt.grid()
    #plt.show()
    
    # Let's see which five features yield such good performance on the validation set.
    # subsets_[8] is the ninth element of subsets_: the step at which 5 of the 13 features remain.
    k5 = list(sbs.subsets_[8])
    print(df_wine.columns[1:][k5])
    '''
    Index(['Alcohol', 'Malic acid', 'Alcalinity of ash', 'Hue', 'Proline'], dtype='object')
    '''
    
    # Let's evaluate the performance of the KNN classifier on the original test set
    knn.fit(X_train_std, y_train)
    print("Training Accuracy:", knn.score(X_train_std, y_train))
    print("Test Accuracy:", knn.score(X_test_std, y_test))
    '''
    Training Accuracy: 0.9838709677419355
    Test Accuracy: 0.9444444444444444
    '''
    # We see a slight degree of overfitting when all 13 features are used for training
    
    
    knn.fit(X_train_std[:, k5], y_train)
    print("Training Accuracy:", knn.score(X_train_std[:, k5], y_train))
    print("Test Accuracy:", knn.score(X_test_std[:, k5], y_test))
    '''
    Training Accuracy: 0.9596774193548387
    Test Accuracy: 0.9629629629629629
    '''
    # Training on only the 5 selected features reduced overfitting, and the test accuracy improved.
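
    # A note on reuse: after fit() finishes, sbs.indices_ holds the *final*
    # k_features-feature subset (here a single feature), so sbs.transform(X)
    # would slice out only that one column. To reuse an intermediate subset
    # such as the 5-feature one, index manually with k5 as above.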
    
    # 2. Random forest shows feature importance
    from sklearn.ensemble import RandomForestClassifier
    feat_labels = df_wine.columns[1:]
    forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
    forest.fit(X_train, y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    
    
    # Print the features ranked from most to least important
    for f in range(X_train.shape[1]):
        print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
    
    plt.subplot(2,1,2)
    plt.title("Feature Importances")
    plt.bar(range(X_train.shape[1]), importances[indices], color='lightblue', align='center')
    plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, X_train.shape[1]])
    plt.tight_layout()
    plt.show()
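
    As a follow-up, these importances can also drive feature selection directly via scikit-learn's SelectFromModel. The snippet below is a minimal sketch, not part of the original example; the 0.1 threshold is an illustrative assumption, not a tuned value.

    from sklearn.feature_selection import SelectFromModel

    # Reuse the already-fitted forest (prefit=True) and keep only the columns
    # whose importance is at least the chosen threshold (assumed 0.1 here).
    sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
    X_selected = sfm.transform(X_train)
    print("Number of features meeting the 0.1 threshold:", X_selected.shape[1])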
    

  • Original post: https://www.cnblogs.com/always-fight/p/9418793.html