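# Example of preparing samples and a classification target for a random forest.
# Notes:
# 1. The target has three or more classes (plain binary logistic regression
#    distinguishes only two).
# 2. The explanatory variables X are organised by row: one sample per row.
# 3. The response variable y is a single sequence of labels: each value
#    corresponds to one row of X.
# 4. Nothing else needs special care; the program handles the rest.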
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 09 17:40:04 2016
@author: Administrator
"""
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 09 16:15:03 2016
@author: Administrator
"""
# Random forest demo
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
#from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

try:
    # scikit-learn >= 0.18: the splitting and search helpers live here.
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module locations.
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
# Hyper-parameter grid explored by the grid search.  The 'clf__' prefix routes
# each entry to the pipeline step named 'clf'.  min_samples_split starts at 2:
# scikit-learn >= 0.18 rejects an integer value of 1, and a one-sample node
# could never be split anyway, so the effective search space is unchanged.
PARAM_GRID = {
    'clf__n_estimators': (5, 10, 20, 50),
    'clf__max_depth': (50, 150, 250),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3),
}

# The target has three classes, so the binary-only 'f1' scorer is not valid
# (scikit-learn >= 0.18 raises for it).  The weighted average is what older
# releases fell back to, with a deprecation warning.
SCORING = 'f1_weighted'


def make_toy_dataset():
    """Return the demo data as ``(X, y)``.

    ``X`` is a (16, 4) array holding every 4-bit 0/1 combination, one sample
    per row.  ``y`` is a 1-D array of 16 labels drawn from {0, 1, 2}; entry
    ``i`` is the class of row ``i`` of ``X``.
    """
    # Alternative data source kept from the original script (disabled):
    # df = pd.read_csv('ad.data', header=None)
    # explanatory_variable_columns = set(df.columns.values)
    # response_variable_column = df[len(df.columns.values)-1]
    # # The last column describes the targets
    # explanatory_variable_columns.remove(len(df.columns.values)-1)
    # y = [1 if e == 'ad.' else 0 for e in response_variable_column]
    # X = df[list(explanatory_variable_columns)]
    # X.replace(to_replace=' *?', value=-1, regex=True, inplace=True)
    X = np.array([
        [0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 1, 1],
        [0, 1, 0, 0],
        [0, 1, 0, 1],
        [0, 1, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 0, 0],
        [1, 0, 0, 1],
        [1, 0, 1, 0],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 0, 1],
        [1, 1, 1, 0],
        [1, 1, 1, 1],
    ])
    # The labels must form a flat (1-D) vector; a multi-row array is rejected
    # by the estimator.
    y = np.array([0, 1, 1, 0, 2, 1, 0, 0, 0, 2, 1, 0, 2, 1, 0, 0])
    return X, y


def main():
    """Grid-search a random forest on the toy data and print the results.

    Splits the data into train/test parts, tunes the forest over
    ``PARAM_GRID`` using the ``SCORING`` metric, prints the best
    cross-validated score and parameters, then prints a classification
    report for the held-out test part.
    """
    X, y = make_toy_dataset()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    pipeline = Pipeline([
        ('clf', RandomForestClassifier(criterion='entropy')),
    ])
    grid_search = GridSearchCV(pipeline, PARAM_GRID, n_jobs=-1, verbose=1,
                               scoring=SCORING)
    grid_search.fit(X_train, y_train)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(PARAM_GRID):
        print(' %s: %r' % (param_name, best_parameters[param_name]))

    predictions = grid_search.predict(X_test)
    print(classification_report(y_test, predictions))


# The guard is required because n_jobs=-1 may spawn worker processes that
# re-import this module.
if __name__ == '__main__':
    main()