XGBClassifier 是 xgboost 提供的 scikit-learn 风格接口。下面的代码完整地展示了使用 xgboost 建立模型的过程，并比较 xgboost 和 RandomForest 的性能。
# -*- coding: utf-8 -*-
"""Compare XGBClassifier against RandomForestClassifier on the Titanic data.

The script loads the data set, keeps the ``pclass``, ``age`` and ``sex``
columns as features and ``survived`` as the target, fills missing ages with
the column mean, vectorizes the features, fits both classifiers and prints
their accuracy on a held-out test split.

Author: wanglei5205
Email:  wanglei5205@126.com
Blog:   http://cnblogs.com/wanglei5205
GitHub: http://github.com/wanglei5205
"""
### imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

### load_data
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# Take an explicit copy so the fill below modifies X itself rather than a
# temporary view of `titanic` (avoids SettingWithCopyWarning / a silent no-op).
X = titanic[['pclass', 'age', 'sex']].copy()  # input features
y = titanic['survived']                       # target labels
# Replace missing ages with the mean age. Plain assignment is used instead of
# a chained `fillna(..., inplace=True)`, which does not reliably update X.
X['age'] = X['age'].fillna(X['age'].mean())

### split_data
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

### feature_extraction
# DictVectorizer turns each row (as a dict) into a dense numeric vector.
# The vocabulary is learned on the training rows only and reused for the test
# rows. `orient='records'` is the full spelling; the abbreviation 'record'
# is rejected by pandas >= 2.0.
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))

### create_model
# rfc: random forest with library defaults
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

# xgbc: gradient-boosted trees with library defaults
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

### model_score
# Mean accuracy of each model on the held-out test split.
print('rfc.score=', rfc.score(X_test, y_test))
print('xgbc.score=', xgbc.score(X_test, y_test))

# Output recorded by the original author (the random forest is not seeded,
# so the exact figures can differ between runs and library versions):
#   rfc.score= 0.787234042553
#   xgbc.score= 0.787234042553