zoukankan      html  css  js  c++  java
  • kaggle_Titanic

    # -*- coding: utf-8 -*-
    """
    Created on Mon Oct  9 14:05:41 2017
    
    @author: lenovo
    """
    
    import numpy as np
    import pandas as pd
    
    # Load the training and test CSVs, tag every row with its origin in an 'id'
    # column, and stack them so feature processing is applied to both at once.
    data_train = pd.read_csv('./input/train.csv')
    data_test = pd.read_csv('./input/test.csv')
    data_train['id'] = 'train'
    data_test['id'] = 'test'
    data = pd.concat([data_train, data_test], axis=0)
    # Print the number of missing values in each column.
    for column, n_missing in data.isnull().sum().items():
        print(column, n_missing)
    
    # Fill in Fare values.
    # Mean of the strictly positive fares for each passenger class. 'Fare' is
    # selected *before* aggregating so that the non-numeric columns are never
    # averaged (recent pandas versions raise on them instead of skipping them).
    fare_mean = data[data['Fare'] > 0].groupby('Pclass')['Fare'].mean()
    # Replace missing fares and fares equal to 0 with the mean of their class.
    for pclass, class_mean in fare_mean.items():
        fare_missing_or_zero = data['Fare'].isnull() | (data['Fare'] == 0)
        data.loc[fare_missing_or_zero & (data['Pclass'] == pclass), 'Fare'] = class_mean
    # Handle missing ages: train a random-forest regressor on the rows whose Age
    # is known and use it to predict Age for the rows where it is missing.
    from sklearn.ensemble import RandomForestRegressor
    data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    age_is_missing = data_for_age['Age'].isnull()
    age_exist = data_for_age[~age_is_missing]
    age_null = data_for_age[age_is_missing]
    # Column 0 is the regression target (Age); the remaining columns are the
    # predictors (Fare, Parch, SibSp, Pclass).
    y = age_exist.values[:, 0]
    x = age_exist.values[:, 1:]
    x_test = age_null.values[:, 1:]
    rf = RandomForestRegressor(n_estimators=200, max_depth=5)
    rf.fit(x, y)
    y_pred = rf.predict(x_test)
    # Write the predicted ages back into the rows that had none.
    data.loc[data['Age'].isnull(), 'Age'] = y_pred
    # Encode Sex as 0 (female) / 1 (male). The original author notes that this
    # column has no missing values, so no imputation is done here.
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})
    # One-hot encode the SibSp, Pclass and Parch columns.
    SibSp = pd.get_dummies(data['SibSp'], prefix='SibSp')
    Pclass = pd.get_dummies(data['Pclass'], prefix='Pclass')
    Parch = pd.get_dummies(data['Parch'], prefix='Parch')
    # Fill missing Embarked values with 'S' (described by the original author as
    # the most frequent value). The result is assigned back to the column:
    # calling fillna(inplace=True) on a column selection is not guaranteed to
    # update the parent DataFrame.
    data['Embarked'] = data['Embarked'].fillna('S')
    Embarked = pd.get_dummies(data['Embarked'], prefix='Embarked')
    # Treat a missing Cabin as its own category, 'u0'. A single .loc assignment
    # is required; data[mask]['Cabin'] = ... would only write to a temporary copy.
    data.loc[data['Cabin'].isnull(), 'Cabin'] = 'u0'
    Cabin = pd.get_dummies(data['Cabin'], prefix='Cabin')
    # Merge everything: drop the raw columns that were encoded above or are not
    # used as features, then append the one-hot frames.
    # NOTE(review): the Cabin dummies are built but not concatenated below --
    # confirm whether leaving them out of data_all is intentional.
    data.drop(['SibSp', 'Pclass', 'Parch', 'Embarked', 'Cabin', 'Name', 'Ticket', 'PassengerId'],
              axis=1, inplace=True)
    data_all = pd.concat([data, SibSp, Pclass, Parch, Embarked], axis=1)
    
    # Modelling and prediction: split the combined frame back into its training
    # and test parts using the 'id' tag. The drops return new frames rather than
    # mutating the filtered selections in place, which avoids pandas'
    # SettingWithCopyWarning.
    data_train = data_all[data_all['id'] == 'train'].drop('id', axis=1)
    data_test = data_all[data_all['id'] == 'test'].drop(['Survived', 'id'], axis=1)
    # Feature matrix (every column except the label) and label vector.
    x = data_train.drop('Survived', axis=1).values
    y = data_train['Survived']
    
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn import metrics
    
    # Split the labelled data, keeping 70% of the rows for training
    # (fixed seed so the split is reproducible).
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.7, random_state=0)
    # Learn the scaling statistics from the training part only, then apply the
    # same transformation to both parts.
    sc = StandardScaler().fit(x_train)
    x_train_std = sc.transform(x_train)
    x_test_std = sc.transform(x_test)
    
    # Logistic regression (L2 penalty), scored on the held-out split.
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(penalty='l2').fit(x_train_std, y_train)
    y_pred_lr = lr.predict(x_test_std)
    lr_accuracy = metrics.accuracy_score(y_test, y_pred_lr)
    print('Logistic Regression:', lr_accuracy)
    # Output recorded by the original author:
    # Logistic Regression: 0.809701492537
    
    # Decision tree: pick max_depth from 1..9 by 5-fold cross-validated grid
    # search, then score the refit model on the held-out split.
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import GridSearchCV
    dt = DecisionTreeClassifier()
    dt_param_grid = {'max_depth': range(1, 10)}
    model_dt = GridSearchCV(dt, param_grid=dt_param_grid, cv=5)
    model_dt.fit(x_train_std, y_train)
    y_pred_dt = model_dt.predict(x_test_std)
    dt_accuracy = metrics.accuracy_score(y_test, y_pred_dt)
    print('Decision Tree:', dt_accuracy)
    # Output recorded by the original author:
    # Decision Tree: 0.813432835821
    
    # Random forest classifier, scored on the held-out split.
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(max_depth=4, n_estimators=200)
    rf.fit(x_train_std, y_train)
    y_pred_rf = rf.predict(x_test_std)
    # Predictions for the unlabelled test set. The features are scaled right
    # here with the fitted scaler: the previous code read `data_test_xgb`, which
    # is only assigned further down the script, so this line raised NameError.
    # The labels are cast to int because they are written out as the final
    # 'Survived' column (presumably they come back as floats, since the label
    # column was built by concatenating with rows that have no label).
    y_pred_rf1 = rf.predict(sc.transform(data_test)).astype(int)
    print('RandomForest:', metrics.accuracy_score(y_test, y_pred_rf))
    # Output recorded by the original author:
    # RandomForest: 0.817164179104
    
    
    # SVM with an RBF kernel: grid-search C over 0.5..0.9 and gamma over
    # 10, 20, ..., 100 with 5-fold cross-validation, then score the refit model
    # on the held-out split.
    from sklearn.svm import SVC
    svc = SVC(kernel='rbf', decision_function_shape='ovo')
    svc_param_grid = {
        'C': np.arange(5, 10) / 10,
        'gamma': range(10, 101, 10),
    }
    model_svc = GridSearchCV(svc, param_grid=svc_param_grid, cv=5)
    model_svc.fit(x_train_std, y_train)
    y_pred_svc = model_svc.predict(x_test_std)
    svc_accuracy = metrics.accuracy_score(y_test, y_pred_svc)
    print('SVM:', svc_accuracy)
    # Output recorded by the original author:
    # SVM: 0.723880597015
    
    # xgboost: wrap the scaled hold-out split in DMatrix objects.
    import xgboost as xgb
    xgb_train = xgb.DMatrix(x_train_std, label=y_train)
    xgb_test = xgb.DMatrix(x_test_std, label=y_test)
    # Booster parameters. The learning task is selected with the 'objective'
    # key; it was previously misspelled 'object', so 'binary:logistic' was never
    # applied and the library fell back to its default objective.
    param = {'max_depth': 4, 'eta': 0.3, 'silent': 1, 'objective': 'binary:logistic'}
    # Data sets whose metrics are reported after every boosting round.
    watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]
    def error_rate(y_hat,y):
        """Return the metric name 'error' and the fraction of misclassified rows.

        y_hat: array of predicted scores; a score above 0.5 counts as the
            positive class.
        y: object exposing get_label(), which returns the true labels.
        """
        predicted_positive = y_hat > 0.5
        is_wrong = y.get_label() != predicted_positive
        wrong_count = float(sum(is_wrong))
        return 'error', wrong_count / len(y_hat)
    # Train for 4 boosting rounds, reporting error_rate on the watchlist sets.
    bst = xgb.train(
        param,
        xgb_train,
        num_boost_round=4,
        evals=watchlist,
        feval=error_rate,
    )
    y_pred_xgb = bst.predict(xgb_test)
    # Accuracy on the held-out split: a score above 0.5 counts as class 1.
    xgb_hits = y_test == (y_pred_xgb > 0.5)
    print('xgb:', np.average(xgb_hits))
    # Output recorded by the original author:
    # XGB: 0.832089552239
    
    
    # Predict the unlabelled test set with the xgboost model.
    # Scale the test features with the scaler fitted on the training split.
    data_test_xgb = sc.transform(data_test)
    xgb_test = xgb.DMatrix(data_test_xgb)
    # Turn scores into 0/1 labels with a single cut at 0.5 -- the same rule the
    # evaluation code uses. The previous two-step in-place thresholding left a
    # score of exactly 0.5 unchanged and produced floats instead of integers.
    y_pred_xgb1 = (bst.predict(xgb_test) > 0.5).astype(int)
    
    # Write the results to files: one CSV per model.
    # 'predictions.csv' is read once and reused for both outputs
    # (presumably it is the submission template -- confirm its columns).
    submission_template = pd.read_csv('predictions.csv')
    for predictions, out_path in ((y_pred_xgb1, 'xgb.csv'), (y_pred_rf1, 'rf.csv')):
        test = submission_template.copy()
        test['Survived'] = predictions
        # index=False keeps the DataFrame's row index from being written as an
        # extra unnamed first column.
        test.to_csv(out_path, index=False)
  • 相关阅读:
    改造vant日期选择
    css3元素垂直居中
    npm综合
    (转)网页加水印方法
    Mac下IDEA自带MAVEN插件的全局环境配置
    隐藏注册控件窗口
    High performance optimization and acceleration for randomWalk, deepwalk, node2vec (Python)
    How to add conda env into jupyter notebook installed by pip
    The Power of WordNet and How to Use It in Python
    背单词app测评,2018年
  • 原文地址:https://www.cnblogs.com/jiegege/p/7641838.html
Copyright © 2011-2022 走看看