zoukankan      html  css  js  c++  java
  • kaggle_Titanic

    # -*- coding: utf-8 -*-
    """
    Created on Mon Oct  9 14:05:41 2017
    
    @author: lenovo
    """
    
    import numpy as np
    import pandas as pd
    
    # Load the data; the train and test sets are stacked so that feature
    # processing is applied to both at once. The 'id' column records which
    # file each row came from so the two sets can be separated again later.
    data_train = pd.read_csv('./input/train.csv')
    data_test = pd.read_csv('./input/test.csv')
    data_train['id'] = 'train'
    data_test['id'] = 'test'
    data = pd.concat([data_train, data_test], axis=0)
    # Report the number of missing values in every column.
    missing_counts = [(column, data[column].isnull().sum()) for column in data.columns]
    for column, n_missing in missing_counts:
        print(column, n_missing)
    
    # Fill in the Fare column.
    # Mean fare per passenger class, computed from strictly positive fares only.
    # Fare is selected before aggregating so the mean is never attempted on the
    # string columns of the frame (which fails on recent pandas versions).
    fare_mean = data[data['Fare'] > 0].groupby('Pclass')['Fare'].mean()
    # Replace both missing fares and zero fares with the mean fare of the
    # passenger's class (classes 1 to 3).
    for pclass in (1, 2, 3):
        needs_fill = (data['Pclass'] == pclass) & (data['Fare'].isnull() | (data['Fare'] == 0))
        data.loc[needs_fill, 'Fare'] = fare_mean[pclass]
    # Handle missing ages: fit a random-forest regressor on the rows whose Age
    # is known (features: Fare, Parch, SibSp, Pclass) and predict the rest.
    from sklearn.ensemble import RandomForestRegressor
    data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    age_is_missing = data_for_age['Age'].isnull()
    age_exist = data_for_age[~age_is_missing]
    age_null = data_for_age[age_is_missing]
    # Column 0 is the target (Age); the remaining columns are the features.
    known_rows = age_exist.values
    y, x = known_rows[:, 0], known_rows[:, 1:]
    x_test = age_null.values[:, 1:]
    rf = RandomForestRegressor(n_estimators=200, max_depth=5)
    rf.fit(x, y)
    y_pred = rf.predict(x_test)
    # Write the predicted ages back into the rows that had none.
    data.loc[data['Age'].isnull(), 'Age'] = y_pred
    # Encode Sex as 0/1 (female -> 0, male -> 1). The original author notes
    # that this column has no missing values.
    sex_codes = {'female': 0, 'male': 1}
    data['Sex'] = data['Sex'].map(sex_codes)
    # One-hot encode the SibSp, Pclass and Parch columns; each result is
    # prefixed with the name of its source column.
    SibSp, Pclass, Parch = (
        pd.get_dummies(data[source], prefix=source)
        for source in ('SibSp', 'Pclass', 'Parch')
    )
    # Handle missing Embarked values: fill with 'S' (described by the original
    # author as the mode). The result is assigned back to the column instead of
    # using a chained in-place fillna, which is not guaranteed to reach `data`.
    data['Embarked'] = data['Embarked'].fillna('S')
    Embarked = pd.get_dummies(data['Embarked'], prefix='Embarked')
    # Handle Cabin: a missing value is treated as "no cabin" and labelled 'u0'.
    # The original wrote to data[mask]['Cabin'], i.e. to a temporary copy, so
    # the fill never reached `data`.
    data['Cabin'] = data['Cabin'].fillna('u0')
    Cabin = pd.get_dummies(data['Cabin'], prefix='Cabin')
    # Merge everything: remove the raw columns that were one-hot encoded or are
    # not used as features, then append the dummy columns side by side.
    dropped_columns = ['SibSp', 'Pclass', 'Parch', 'Embarked', 'Cabin', 'Name', 'Ticket', 'PassengerId']
    data.drop(dropped_columns, axis=1, inplace=True)
    encoded_frames = [SibSp, Pclass, Parch, Embarked]
    data_all = pd.concat([data] + encoded_frames, axis=1)
    
    # Modelling and prediction: split the combined frame back into the labelled
    # training rows and the unlabelled test rows using the 'id' marker column.
    # drop() returns a new frame here rather than mutating a filtered slice in
    # place, which avoids pandas' SettingWithCopyWarning.
    data_train = data_all[data_all['id'] == 'train'].drop('id', axis=1)
    data_test = data_all[data_all['id'] == 'test'].drop(['Survived', 'id'], axis=1)
    # Feature matrix (every column except the label) and label vector.
    x = data_train.drop('Survived', axis=1).values
    y = data_train['Survived']
    
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn import metrics
    
    # Hold out 30% of the labelled rows for evaluation (fixed seed), then
    # standardize the features using statistics fitted on the training part only.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.7, random_state=0
    )
    sc = StandardScaler()
    sc.fit(x_train)
    x_train_std = sc.transform(x_train)
    x_test_std = sc.transform(x_test)
    
    # Logistic regression (L2 penalty), scored on the hold-out split.
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train_std, y_train)
    y_pred_lr = lr.predict(x_test_std)
    acc_lr = metrics.accuracy_score(y_test, y_pred_lr)
    print('Logistic Regression:', acc_lr)
    # Output recorded by the original author:
    # Logistic Regression: 0.809701492537
    
    # Decision tree; max_depth is chosen from 1..9 by 5-fold grid search.
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier()
    depth_grid = {'max_depth': range(1, 10)}
    model_dt = GridSearchCV(dt, param_grid=depth_grid, cv=5)
    model_dt.fit(x_train_std, y_train)
    y_pred_dt = model_dt.predict(x_test_std)
    acc_dt = metrics.accuracy_score(y_test, y_pred_dt)
    print('Decision Tree:', acc_dt)
    # Output recorded by the original author:
    # Decision Tree: 0.813432835821
    
    # Random forest (200 trees, depth 4), scored on the hold-out split.
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(max_depth=4,n_estimators=200)
    rf.fit(x_train_std,y_train)
    y_pred_rf = rf.predict(x_test_std)
    # Predictions for the unlabelled test set. The test features are scaled
    # here with the already-fitted scaler: the original referenced
    # data_test_xgb, which is only assigned further down the script, so this
    # line raised NameError when the script ran top to bottom.
    y_pred_rf1 = rf.predict(sc.transform(data_test))
    print('RandomForest:',metrics.accuracy_score(y_test,y_pred_rf))
    # Output recorded by the original author:
    # RandomForest: 0.817164179104
    
    
    # SVM with an RBF kernel; C and gamma are chosen by 5-fold grid search.
    from sklearn.svm import SVC
    svc = SVC(kernel='rbf', decision_function_shape='ovo')
    svc_grid = {
        'C': np.arange(5, 10) / 10,     # 0.5, 0.6, ..., 0.9
        'gamma': range(10, 101, 10),    # 10, 20, ..., 100
    }
    model_svc = GridSearchCV(svc, param_grid=svc_grid, cv=5)
    model_svc.fit(x_train_std, y_train)
    y_pred_svc = model_svc.predict(x_test_std)

    acc_svc = metrics.accuracy_score(y_test, y_pred_svc)
    print('SVM:', acc_svc)
    # Output recorded by the original author:
    # SVM: 0.723880597015
    
    # XGBoost: wrap the standardized train / hold-out splits in DMatrix objects.
    import xgboost as xgb
    xgb_train = xgb.DMatrix(x_train_std,label=y_train)
    xgb_test = xgb.DMatrix(x_test_std,label=y_test)
    # The learning task is selected with the 'objective' key. The original
    # spelled it 'object', so the requested 'binary:logistic' objective was
    # never passed under the name XGBoost reads.
    param = {'max_depth':4,'eta':0.3,'silent':1,'objective':'binary:logistic'}
    # Datasets whose metric is reported after every boosting round.
    watchlist = [(xgb_train,'train'),(xgb_test,'test')]
    def error_rate(y_hat,y):
        """Return ('error', fraction of rows misclassified at a 0.5 threshold).

        y_hat holds the predicted scores; y must expose get_label(), which
        supplies the true labels. A score strictly above 0.5 counts as a
        positive prediction.
        """
        true_labels = y.get_label()
        predicted_positive = y_hat > 0.5
        n_wrong = sum(true_labels != predicted_positive)
        return 'error', float(n_wrong) / len(y_hat)
    # Train for 4 boosting rounds, reporting error_rate on the watchlist, then
    # score the hold-out split at a 0.5 threshold.
    bst = xgb.train(
        param,
        xgb_train,
        num_boost_round=4,
        evals=watchlist,
        feval=error_rate,
    )
    y_pred_xgb = bst.predict(xgb_test)
    xgb_correct = y_test == (y_pred_xgb > 0.5)
    print('xgb:', np.average(xgb_correct))
    # Output recorded by the original author:
    # XGB: 0.832089552239
    
    
    # Use the trained booster to predict the unlabelled test set.
    data_test_xgb = sc.transform(data_test)
    xgb_test = xgb.DMatrix(data_test_xgb)
    # Convert scores to hard integer labels in one step: strictly above 0.5
    # becomes 1, everything else 0. The original's two masked assignments left
    # a score of exactly 0.5 unchanged and produced float labels.
    y_pred_xgb1 = (bst.predict(xgb_test) > 0.5).astype(int)
    
    # Write one output file per model.
    # NOTE(review): 'predictions.csv' is presumably a submission template with
    # one row per test passenger -- confirm; it is not created by this script.
    for out_path, labels in (('xgb.csv', y_pred_xgb1), ('rf.csv', y_pred_rf1)):
        test = pd.read_csv('predictions.csv')
        # Store the labels as integers rather than floats.
        test['Survived'] = np.asarray(labels).astype(int)
        # index=False keeps the DataFrame's row index from being written as an
        # extra unnamed first column.
        test.to_csv(out_path, index=False)
  • 相关阅读:
    Shared Memory in Windows NT
    Layered Memory Management in Win32
    软件项目管理的75条建议
    Load pdbs when you need it
    Stray pointer 野指针
    About the Rebase and Bind operation in the production of software
    About "Serious Error: No RTTI Data"
    Realizing 4 GB of Address Space[MSDN]
    [bbk4397] 第1集 第一章 AMS介绍
    [bbk3204] 第67集 Chapter 17Monitoring and Detecting Lock Contention(00)
  • 原文地址:https://www.cnblogs.com/jiegege/p/7641838.html
Copyright © 2011-2022 走看看