"""Predict Titanic passenger survival with a random forest.

Ensemble learning builds several models and combines them to solve a single
prediction problem: multiple classifiers each learn and predict independently,
and their predictions are combined, typically beating any single classifier.
A random forest is an ensemble of decision trees whose output class is the
mode (majority vote) of the individual trees' outputs.
"""
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier  # random forest API

# NOTE(review): the original path literal was garbled (missing backslashes /
# literal tabs); reconstructed best guess — TODO confirm against the real disk layout.
DATA_PATH = r"E:\360Downloads\Software\tableau\Tableau\data\titanic_passenger_list.csv"


def main():
    """Load the Titanic data, train a grid-searched random forest, report accuracy."""
    titanic = pd.read_csv(DATA_PATH)
    # print(titanic.columns)

    # Select features and target; .copy() avoids chained-assignment on a view.
    x = titanic[['pclass', 'age', 'sex']].copy()
    y = titanic['survived']

    # Fill missing ages with the column mean (direct assignment instead of
    # inplace fillna on a slice, which triggers SettingWithCopyWarning and
    # may silently fail to update the frame).
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split into training and test sets (25% held out).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # One-hot encode the categorical features. to_dict(orient="records")
    # turns each row into a dict so DictVectorizer can vectorize it.
    dictv = DictVectorizer(sparse=False)
    x_train = dictv.fit_transform(x_train.to_dict(orient="records"))
    # BUG FIX: use transform (not fit_transform) on the test set so it is
    # encoded with the vocabulary fitted on the training set; refitting could
    # reorder or resize the feature columns and leak test-set information.
    x_test = dictv.transform(x_test.to_dict(orient="records"))

    # Random forest with hyper-parameter tuning via grid search + 2-fold CV.
    rf = RandomForestClassifier()
    param = {'n_estimators': [120, 200, 300], 'max_depth': [5, 8, 10]}
    gcv = GridSearchCV(rf, param_grid=param, cv=2)
    gcv.fit(x_train, y_train)

    print('准确率:', gcv.score(x_test, y_test))
    print('查看选择的参数模型:', gcv.best_params_)


if __name__ == "__main__":
    main()