1.原始交叉验证
# Import the linear regression class from sklearn.linear_model import LinearRegression # Sklearn also has a helper that makes it easy to do cross validation from sklearn.cross_validation import KFold import numpy as np # The columns we'll use to predict the target predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"] # Initialize our algorithm class alg = LinearRegression() # Generate cross validation folds for the titanic dataset. It return the row indices corresponding to train and test. # We set random_state to ensure we get the same splits every time we run this. kf = KFold(titanic.shape[0], n_folds=3, random_state=1) predictions = [] for train, test in kf: # The predictors we're using the train the algorithm. Note how we only take the rows in the train folds. train_predictors = (titanic[predictors].iloc[train,:]) # The target we're using to train the algorithm. train_target = titanic["Survived"].iloc[train] # Training the algorithm using the predictors and target. alg.fit(train_predictors, train_target) # We can now make predictions on the test fold test_predictions = alg.predict(titanic[predictors].iloc[test,:]) predictions.append(test_predictions) # The predictions are in three separate numpy arrays. Concatenate them into one. # We concatenate them on axis 0, as they only have one axis. predictions = np.concatenate(predictions, axis=0) # Map predictions to outcomes (only possible outcomes are 1 and 0) predictions[predictions > .5] = 1 predictions[predictions <=.5] = 0 accuracy = sum(predictions[predictions == titanic["Survived"]]) / len(predictions) print accuracy
0.783389450056
2.cross_validation交叉验证
from sklearn import cross_validation from sklearn.linear_model import LogisticRegression # Initialize our algorithm alg = LogisticRegression(random_state=1) # Compute the accuracy score for all the cross validation folds. (much simpler than what we did before!) scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3) # Take the mean of the scores (because we have one for each fold) print(scores.mean())
0.787878787879
from sklearn import cross_validation from sklearn.ensemble import RandomForestClassifier predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"] # Initialize our algorithm with the default paramters # n_estimators is the number of trees we want to make # min_samples_split is the minimum number of rows we need to make a split # min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree) alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1) # Compute the accuracy score for all the cross validation folds. (much simpler than what we did before!) kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1) scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf) # Take the mean of the scores (because we have one for each fold) print(scores.mean())
0.785634118967
3. 多模型投票分类
predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"] algorithms = [ [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors], [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]] ] full_predictions = [] for alg, predictors in algorithms: # Fit the algorithm using the full training data. alg.fit(titanic[predictors], titanic["Survived"]) # Predict using the test dataset. We have to convert all the columns to floats to avoid an error. predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:,1] full_predictions.append(predictions) # The gradient boosting classifier generates better predictions, so we weight it higher. predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4 predictions
交叉验证中混合模型分类
from sklearn.ensemble import GradientBoostingClassifier import numpy as np # The algorithms we want to ensemble. # We're using the more linear predictors for the logistic regression, and everything with the gradient boosting classifier. algorithms = [ [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title",]], [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]] ] # Initialize the cross validation folds kf = KFold(titanic.shape[0], n_folds=3, random_state=1) predictions = [] for train, test in kf: train_target = titanic["Survived"].iloc[train] full_test_predictions = [] # Make predictions for each algorithm on each fold for alg, predictors in algorithms: # Fit the algorithm on the training data. alg.fit(titanic[predictors].iloc[train,:], train_target) # Select and predict on the test fold. # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error. test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1] full_test_predictions.append(test_predictions) # Use a simple ensembling scheme -- just average the predictions to get the final classification. test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2 # Any value over .5 is assumed to be a 1 prediction, and below .5 is a 0 prediction. test_predictions[test_predictions <= .5] = 0 test_predictions[test_predictions > .5] = 1 predictions.append(test_predictions) # Put all the predictions together into one array. predictions = np.concatenate(predictions, axis=0) # Compute accuracy by comparing to the training data. accuracy = sum(predictions[predictions == titanic["Survived"]]) / len(predictions) print(accuracy)