# Titanic survival script: load the train/test CSVs, drop unused columns,
# derive a Title feature from Name, bin Fare into ordinal bands, fit a
# logistic-regression model and assemble a submission frame.

import random as rnd

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# `%matplotlib inline` is IPython-only syntax; request the same backend
# through the IPython API when available so the file is also valid Python.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # Not running under IPython/Jupyter.

# Raw strings: in a normal literal `\U` is an (invalid) unicode escape and
# `\t` is a tab, so the original Windows paths could not even be parsed.
train_df = pd.read_csv(r'C:\Users\Liubotao\Desktop\train.csv')
test_df = pd.read_csv(r'C:\Users\Liubotao\Desktop\test.csv')
combine = [train_df, test_df]

print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
# `drop` returns new frames, so rebuild the list to point at them.
combine = [train_df, test_df]

# Printed (rather than left as a bare expression) so it shows up outside a
# notebook cell too, mirroring the "Before" line.
print("After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

# Title = the word that is immediately followed by a literal period.
# The dot must be escaped; an unescaped `.` matches any character and would
# capture the first word after any space instead.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

print(pd.crosstab(train_df['Title'], train_df['Sex']))

# Fare bands: (-inf, 7.91] -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2,
# (31, inf) -> 3. Same right-closed edges as the original chained .loc
# assignments.
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
for dataset in combine:
    # A missing fare would make the integer cast below raise, so fill it
    # with that dataset's median first.
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_edges, labels=False).astype(int)

# 'FareBand' is not created anywhere in this script; tolerate its absence
# instead of raising KeyError.
train_df = train_df.drop(['FareBand'], axis=1, errors='ignore')

Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()
# Use exactly the test feature columns (same set, same order) for training;
# previously X_train kept PassengerId while X_test dropped it.
X_train = train_df[X_test.columns]

# NOTE(review): string-valued columns (e.g. Name, Title) are still present in
# the feature frames at this point; LogisticRegression needs numeric input
# without missing values, so they must be encoded/dropped/imputed before
# fitting — confirm the intended feature set.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

submission = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": Y_pred})