# Derived feature columns for the training frame.

# NameLength is len() of each entry in the Name column; Series.apply
# returns a new Series, which is stored as a column.
titanic["NameLength"] = titanic["Name"].apply(len)

# FamilySize is the element-wise sum of the SibSp and Parch columns.
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
import re

# A space, a run of ASCII letters (the title), then a literal period.
# The period has to be escaped: a bare "." matches any character, so a
# multi-word surname such as "Vander Planke, Mrs. ..." would produce
# "Planke" instead of "Mrs".
_TITLE_PATTERN = re.compile(r" ([A-Za-z]+)\.")


def get_title(name):
    """Return the title embedded in *name*, or "" when none is found.

    A title is the first run of letters that is preceded by a space and
    immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen".
    """
    title_search = _TITLE_PATTERN.search(name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""


# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(titles.value_counts())

# Map each title to an integer. Some titles are very rare, and are
# compressed into the same codes as other titles.
title_mapping = {
    "Mr": 1,
    "Miss": 2,
    "Mrs": 3,
    "Master": 4,
    "Dr": 5,
    "Rev": 6,
    "Major": 7,
    "Col": 7,
    "Mlle": 8,
    "Mme": 8,
    "Don": 9,
    "Lady": 10,
    "Countess": 10,
    "Jonkheer": 10,
    "Sir": 9,
    "Capt": 7,
    "Ms": 2,
}
# Single pass over the column; a title missing from the mapping is kept
# unchanged so it shows up in the verification counts below.
titles = titles.map(lambda title: title_mapping.get(title, title))

# Verify that we converted everything.
print(titles.value_counts())

# Add in the title column.
titanic["Title"] = titles
Mr 517 Miss 182 Mrs 125 Master 40 Dr 7 Rev 6 Major 2 Mlle 2 Col 2 Sir 1 Mme 1 Lady 1 Countess 1 Capt 1 Ms 1 Don 1 Jonkheer 1 Name: Name, dtype: int64 1 517 2 183 3 125 4 40 5 7 6 6 7 5 10 3 8 3 9 2 Name: Name, dtype: int64
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif  # univariate feature selection
import matplotlib.pyplot as plt

# Candidate feature columns to be scored against the "Survived" column.
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "FamilySize",
    "Title",
    "NameLength",
]

# Perform feature selection: fit a SelectKBest selector using the
# f_classif scoring function on the candidate columns.
features = titanic[predictors]
target = titanic["Survived"]
selector = SelectKBest(score_func=f_classif, k=5)
selector.fit(features, target)

# Get the raw p-values for each feature, and transform from p-values into
# scores (a smaller p-value gives a larger score).
scores = -np.log10(selector.pvalues_)

# Plot the scores, one bar per candidate column, labelled with the column
# name. See how "Pclass", "Sex", "Title", and "Fare" are the best?
positions = range(len(predictors))
plt.bar(positions, scores)
plt.xticks(positions, predictors, rotation='vertical')
plt.show()

# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]

# NOTE(review): RandomForestClassifier is not imported in the visible code;
# confirm it is imported earlier in the file.
alg = RandomForestClassifier(
    random_state=1,
    n_estimators=50,
    min_samples_split=8,
    min_samples_leaf=4,
)
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# The algorithms we want to ensemble, each paired with the columns it uses.
# We're using the more linear predictors for the logistic regression, and
# everything with the gradient boosting classifier.
# NOTE(review): LogisticRegression and KFold are not imported in the visible
# code; confirm they are imported earlier in the file.
algorithms = [
    [
        GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
        ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"],
    ],
    [
        LogisticRegression(random_state=1, solver='liblinear'),
        ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"],
    ],
]

# Initialize the cross validation folds.
# random_state is intentionally not passed: it has no effect when
# shuffle=False, and recent scikit-learn versions raise a ValueError for
# that combination. The folds themselves are unchanged.
kf = KFold(n_splits=3, shuffle=False)

predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold.
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select and predict on the test fold, keeping the probability of
        # class 1. The .astype(float) converts the dataframe to all floats
        # to avoid an sklearn error.
        test_predictions = alg.predict_proba(
            titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- average the probabilities of all
    # classifiers (works for any number of entries in `algorithms`).
    test_predictions = np.mean(full_test_predictions, axis=0)
    # Any value over .5 is assumed to be a 1 prediction, and anything at or
    # below .5 is a 0 prediction.
    test_predictions = np.where(test_predictions > .5, 1.0, 0.0)
    predictions.append(test_predictions)

# Put all the predictions together into one array. Because the folds are
# not shuffled, the concatenated order matches the row order of titanic.
predictions = np.concatenate(predictions, axis=0)

# Compute accuracy by comparing to the training data.
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
# Extract the title from every name in the test set.
titles = titanic_test["Name"].apply(get_title)

# We're adding the Dona title to the mapping, because it's in the test set,
# but not the training set.
title_mapping = {
    "Mr": 1,
    "Miss": 2,
    "Mrs": 3,
    "Master": 4,
    "Dr": 5,
    "Rev": 6,
    "Major": 7,
    "Col": 7,
    "Mlle": 8,
    "Mme": 8,
    "Don": 9,
    "Lady": 10,
    "Countess": 10,
    "Jonkheer": 10,
    "Sir": 9,
    "Capt": 7,
    "Ms": 2,
    "Dona": 10,
}
# Single pass over the column; a title missing from the mapping is kept
# unchanged so it is visible in the counts printed below.
titles = titles.map(lambda title: title_mapping.get(title, title))
titanic_test["Title"] = titles

# Check the counts of each unique title. Series.value_counts() is used
# instead of the deprecated top-level pandas.value_counts().
print(titanic_test["Title"].value_counts())

# Now, we add the family size column (sum of the SibSp and Parch columns).
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]
1 240
2 79
3 72
4 21
7 2
6 2
10 1
5 1
Name: Title, dtype: int64
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "Fare",
    "Embarked",
    "FamilySize",
    "Title",
]

# Each entry pairs a classifier with the columns it is trained on.
algorithms = [
    [
        GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
        predictors,
    ],
    [
        LogisticRegression(random_state=1, solver='liblinear'),
        ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"],
    ],
]

full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(titanic[predictors], titanic["Survived"])
    # Predict using the test dataset, keeping the raw probability of class 1.
    # We have to convert all the columns to floats to avoid an error.
    # Thresholding is deliberately NOT done here: the probabilities must be
    # combined first, otherwise the ensemble below is meaningless.
    full_predictions.append(
        alg.predict_proba(titanic_test[predictors].astype(float))[:, 1])

# The gradient boosting classifier generates better predictions, so we
# weight it higher (3:1 against the logistic regression).
predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4

# Any value over .5 is a 1 prediction, anything at or below .5 is a 0.
predictions[predictions <= .5] = 0
predictions[predictions > .5] = 1

predictions
array([0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1.,
0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0.,
0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1.,
0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0.,
1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1.,
1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
1., 1., 1., 1., 1., 0., 1., 0., 0., 0.])