  • Feature Selection Can Reduce Overfitting, and Random Forests Can Show Feature Importance

    1. Feature selection can reduce overfitting: a code example

     This example comes from Chapter 4 of Machine Learning in Action. Sequential backward selection (SBS) starts from the full feature set and greedily removes, at each step, the feature whose removal costs the least validation accuracy, until only k_features features remain.

    # coding=utf-8

    '''
    We use KNN to show that feature selection may reduce overfitting.
    '''

    from sklearn.base import clone
    from itertools import combinations
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    class SBS():
        def __init__(self, estimator, k_features, scoring=accuracy_score, test_size=0.25, random_state=1):
            self.scoring = scoring
            self.estimator = clone(estimator)
            self.k_features = k_features
            self.test_size = test_size
            self.random_state = random_state

        def fit(self, X, y):
            # Hold out an internal validation split for scoring candidate feature subsets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)
            dim = X_train.shape[1]

            self.indices_ = tuple(range(dim))
            self.subsets_ = [self.indices_]
            score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)

            self.scores_ = [score]

            while dim > self.k_features:
                scores = []
                subsets = []

                # Score every subset obtained by dropping exactly one feature
                for p in combinations(self.indices_, r=dim - 1):
                    score = self._calc_score(X_train, y_train, X_test, y_test, p)
                    scores.append(score)
                    subsets.append(p)

                # Keep the best (dim-1)-feature subset and shrink by one dimension
                best = np.argmax(scores)
                self.indices_ = subsets[best]
                self.subsets_.append(self.indices_)
                dim -= 1

                self.scores_.append(scores[best])

            self.k_score_ = self.scores_[-1]

            return self

        def transform(self, X):
            # Project X onto the finally selected feature subset
            return X[:, self.indices_]

        def _calc_score(self, X_train, y_train, X_test, y_test, indices):
            self.estimator.fit(X_train[:, indices], y_train)
            y_pred = self.estimator.predict(X_test[:, indices])
            score = self.scoring(y_test, y_pred)

            return score
    
    import pandas as pd
    
    df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
    df_wine.columns = ['Class label', 'Alcohol',
                       'Malic acid',
                       'Ash',
                       'Alcalinity of ash',
                       'Magnesium',
                       'Total phenols',
                       'Flavanoids',
                       'Nonflavanoid phenols',
                       'Proanthocyanins',
                       'Color intensity',
                       'Hue',
                       'OD280/OD315 of diluted wines',
                       'Proline']
    
    X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    
    from sklearn.preprocessing import StandardScaler
    stdsc = StandardScaler()
    X_train_std = stdsc.fit_transform(X_train)
    X_test_std = stdsc.transform(X_test)
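
    # KNN is distance-based, so features must be brought to comparable scales;
    # fitting the scaler on the training split only and reusing it on the test
    # split avoids leaking test-set statistics into preprocessing.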
    
    
    from sklearn.neighbors import KNeighborsClassifier
    import matplotlib.pyplot as plt
    knn = KNeighborsClassifier(n_neighbors=2)
    sbs = SBS(knn, k_features=1)
    sbs.fit(X_train_std, y_train)
    
    k_feat = [len(k) for k in sbs.subsets_]
    
    plt.figure(figsize=(8, 10))  # figsize must be a tuple
    plt.subplot(2,1,1)
    plt.plot(k_feat, sbs.scores_, marker='o')
    plt.ylim([0.7, 1.1])
    plt.ylabel('Accuracy')
    plt.xlabel('Number of features')
    plt.grid()
    #plt.show()
    
    # Let's see which five features yield such good performance on the validation set.
    # subsets_[8] is the ninth element of subsets_: the step at which 5 of the 13 features remain.
    k5 = list(sbs.subsets_[8])
    print(df_wine.columns[1:][k5])
    '''
    Index(['Alcohol', 'Malic acid', 'Alcalinity of ash', 'Hue', 'Proline'], dtype='object')
    '''
    
    # Let's evaluate the performance of the KNN classifier on the original test set
    knn.fit(X_train_std, y_train)
    print("Training Accuracy:", knn.score(X_train_std, y_train))
    print("Test Accuracy:", knn.score(X_test_std, y_test))
    '''
    Training Accuracy: 0.9838709677419355
    Test Accuracy: 0.9444444444444444
    '''
    # We see a slight degree of overfitting when all 13 features are used for training
    
    
    knn.fit(X_train_std[:, k5], y_train)
    print("Training Accuracy:", knn.score(X_train_std[:, k5], y_train))
    print("Test Accuracy:", knn.score(X_test_std[:, k5], y_test))
    '''
    Training Accuracy: 0.9596774193548387
    Test Accuracy: 0.9629629629629629
    '''
    # Training on only the 5 selected features reduced overfitting, and the test accuracy improved.
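
    # A note on reuse: after fit() finishes, sbs.indices_ holds the *final*
    # k_features-feature subset (here a single feature), so sbs.transform(X)
    # would slice out only that one column. To reuse an intermediate subset
    # such as the 5-feature one, index manually with k5 as above.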
    
    # 2. Random forest shows feature importance
    from sklearn.ensemble import RandomForestClassifier
    feat_labels = df_wine.columns[1:]
    forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
    forest.fit(X_train, y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    
    
    # Print the features ranked from most to least important
    for f in range(X_train.shape[1]):
        print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
    
    plt.subplot(2,1,2)
    plt.title("Feature Importances")
    plt.bar(range(X_train.shape[1]), importances[indices], color='lightblue', align='center')
    plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, X_train.shape[1]])
    plt.tight_layout()
    plt.show()
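
    As a follow-up, these importances can also drive feature selection directly via scikit-learn's SelectFromModel. The snippet below is a minimal sketch, not part of the original example; the 0.1 threshold is an illustrative assumption, not a tuned value.

    from sklearn.feature_selection import SelectFromModel

    # Reuse the already-fitted forest (prefit=True) and keep only the columns
    # whose importance is at least the chosen threshold (assumed 0.1 here).
    sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
    X_selected = sfm.transform(X_train)
    print("Number of features meeting the 0.1 threshold:", X_selected.shape[1])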
    

  • Original post: https://www.cnblogs.com/always-fight/p/9418793.html