zoukankan      html  css  js  c++  java
  • 使用Graphlab参加Kaggle比赛(2017-08-20 发布于知乎)

    之前用学生证在graphlab上申了一年的graphlab使用权(华盛顿大学机器学习课程需要)然后今天突然想到完全可以用这个东东来参加kaggle.

    下午参考了一篇教程,把notebook上面的写好了

    本文很多代码参考了turi官网的一个教程,有兴趣的同学可以去看原版 

    代码

    import graphlab as gl
    %matplotlib inline
    import matplotlib.pyplot as mpl 
    mpl.rcParams['figure.figsize']=(15.0,8.0) 
    import numpy as np

    第一步:数据探索

    导入数据

    # Load the Kaggle training set. The package is imported as `gl`, so the
    # bare name `graphlab` is not bound and must not be used here.
    train = gl.SFrame.read_csv('train.csv')

    数据探索与数据可视化

    # Count the missing values in every column except Survived.
    columns = ("Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")
    # A value counts as present when it is truthy or equal to 0, so None and
    # empty strings are treated as missing.
    not_null=[sum(1 for el in train[column] if el or el == 0)for column in columns]
    null = [len(train) - el for el in not_null]
    # One x position per column.
    indexes = np.arange(len(columns))
    width = 0.5
    # Stacked bar chart of present vs. missing values.
    not_null_bar = mpl.bar(indexes, not_null, width, color='green', edgecolor='white', alpha=0.8)# present values: green bars, white edges
    null_bar = mpl.bar(indexes, null, width, color='red', edgecolor='white', bottom=not_null, alpha=0.8)# missing values: red bars stacked on top, white edges
    mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)# x-axis range
    # Chart title.
    mpl.title('Null values for each column', fontsize=20, weight='bold')
    # x ticks: one per column, labelled with the column name.
    mpl.xticks(indexes + width/2., columns, fontsize=16)
    # y ticks every 100 rows.
    mpl.yticks(np.arange(0,1200,100))
    # Legend mapping the bar colours to 'Not Null' / 'Null'.
    mpl.legend( (not_null_bar[0], null_bar[0]), ('Not Null', 'Null') )

     

    观察上图我们知道Age列有少量缺值,Cabin列有大量的缺值,于是我们需要补全Age缺值,但是把Cabin列整个忽略

    直接用Age的均值补全空值

    # Replace missing Age values with the mean of the Age column.
    train = train.fillna('Age',train['Age'].mean())
    # Recount the missing values in every column except Survived.
    columns = ("Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")
    # A value counts as present when it is truthy or equal to 0, so None and
    # empty strings are treated as missing.
    not_null=[sum(1 for el in train[column] if el or el == 0)for column in columns]
    null = [len(train) - el for el in not_null]
    # One x position per column.
    indexes = np.arange(len(columns))
    width = 0.5
    # Stacked bar chart of present vs. missing values.
    not_null_bar = mpl.bar(indexes, not_null, width, color='green', edgecolor='white', alpha=0.8)# present values: green bars, white edges
    null_bar = mpl.bar(indexes, null, width, color='red', edgecolor='white', bottom=not_null, alpha=0.8)# missing values: red bars stacked on top, white edges
    mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)# x-axis range
    # Chart title.
    mpl.title('Null values for each column', fontsize=20, weight='bold')
    # x ticks: one per column, labelled with the column name.
    mpl.xticks(indexes + width/2., columns, fontsize=16)
    # y ticks every 100 rows.
    mpl.yticks(np.arange(0,1200,100))
    # Legend mapping the bar colours to 'Not Null' / 'Null'.
    mpl.legend( (not_null_bar[0], null_bar[0]), ('Not Null', 'Null') )

     

    我们看看Pclass与生存率的关系

    # Count passengers for every (Pclass, Survived) combination.
    class_distribution = train.groupby(["Pclass", "Survived"], {'count':gl.aggregate.COUNT()})
    # Split into survivors (Survived == 1) and casualties (Survived == 0),
    # both ordered by Pclass so the two frames line up row by row.
    survived = class_distribution.filter_by(1,'Survived').sort("Pclass")
    died = class_distribution.filter_by(0,'Survived').sort("Pclass")

    width = 0.5
    # Stacked bars: survivors at the bottom, casualties on top.
    survived_bar = mpl.bar(survived["Pclass"], survived["count"], width, color='green', edgecolor='white', alpha=0.8)
    died_bar = mpl.bar(died["Pclass"], died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)

    mpl.title('Survived by Passengers Class', fontsize=20, weight='bold')
    mpl.xticks(survived["Pclass"] + width/2., survived["Pclass"], fontsize=16)
    # The x-limits are set once, here; this cell no longer reads the
    # `indexes` array left behind by an earlier cell.
    mpl.xlim(0.5,4)
    mpl.yticks(np.arange(0,600,50))
    mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

    # Write the survival percentage above each stacked bar.
    for ind, (alive, dead) in enumerate(zip(survived["count"], died["count"])):
        total = alive + dead
        x = 1 + ind + width / 2.
        percentage = alive / float(total) * 100
        mpl.text(x, total + 10, "%5.2f%%" % percentage, fontsize=20, ha='center')

    由此可见,Pclass的存活率从1到3逐次下降

    我们看看性别与生存率的关系

    # Count passengers for every (Sex, Survived) combination.
    sex_distribution = train.groupby(["Sex", "Survived"], {'count':gl.aggregate.COUNT()})
    
    # Survivors (Survived == 1) and casualties (Survived == 0), both ordered by Sex.
    survived = sex_distribution.filter_by(1,'Survived').sort("Sex")
    died = sex_distribution.filter_by(0,'Survived').sort("Sex")
    
    # One x position per Sex value.
    indexes = np.arange(len(survived["Sex"]))
    
    
    width = 0.5 
    
    # Stacked bars: survivors at the bottom, casualties on top.
    survived_bar = mpl.bar(indexes, survived["count"], width, color='green', edgecolor='white', alpha=0.8)
    died_bar = mpl.bar(indexes, died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
    mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)
    
    mpl.title('Survived by Sex', fontsize=20, weight='bold')
    # Capitalize the Sex values so they can be used as tick labels.
    survived["Sex"] = [sex.capitalize() for sex in survived["Sex"]]
    mpl.xticks(indexes + width/2., survived["Sex"], fontsize=16)
    # Overrides the x-limits set a few lines above.
    mpl.xlim(-0.5,2)
    mpl.yticks(np.arange(0,700, 50))
    mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )
    
    # Write the survival percentage above each stacked bar.
    for ind in indexes:
        ind = int(ind)  # convert the numpy integer to a plain int before indexing
        x = ind + width / 2.
        y = survived["count"][ind] + died["count"][ind] + 10 
        percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
        mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')
    mpl.show()

    我们看看年龄与生存率的关系

    为了更加直观的体现成人与小孩的区别,我再增加一个"Categorized_Age"列

    我们使用apply方法来对每个元素进行作用,不超过18岁(含18岁)称为小孩,其余均为大人。

    # Add a column: ages of 18 and under are labelled "Child", everything else "Adult".
    train['Categorized_Age'] = train['Age'].apply(lambda x: "Child" if x <= 18 else "Adult")
    # Count passengers for every (Categorized_Age, Survived) combination.
    age_distribution = train.groupby(["Categorized_Age", "Survived"], {'count':gl.aggregate.COUNT()}).dropna()
    # Survivors (Survived == 1) and casualties (Survived == 0), both ordered by category.
    survived = age_distribution.filter_by(1,'Survived').sort("Categorized_Age")
    died = age_distribution.filter_by(0,'Survived').sort("Categorized_Age")
    # Bar chart set-up: one x position per age category.
    indexes = np.arange(len(survived["Categorized_Age"]))
    
    
    width = 0.5 
    
    # Stacked bars: survivors at the bottom, casualties on top.
    survived_bar = mpl.bar(indexes, survived["count"], width, color='green', edgecolor='white', alpha=0.8)
    died_bar = mpl.bar(indexes, died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
    mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)
    
    mpl.title('Survived by Age Categorization', fontsize=20, weight='bold')
    # NOTE(review): the comprehension variable is named `sex` but iterates over
    # the age categories.
    survived["Categorized_Age"] = [sex.capitalize() for sex in survived["Categorized_Age"]]
    mpl.xticks(indexes + width/2., survived["Categorized_Age"], fontsize=16)
    # Overrides the x-limits set a few lines above.
    mpl.xlim(-0.5,2)
    mpl.yticks(np.arange(0,700, 50))
    mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )
    
    # Write the survival percentage above each stacked bar.
    for ind in indexes:
        ind = int(ind)  # convert the numpy integer to a plain int before indexing
        x = ind + width / 2.
        y = survived["count"][ind] + died["count"][ind] + 10 
        percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
        mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')
    
    
    mpl.show()

     

    由上图可知,未成年人的存活率远大于成人

    我们看看家眷人数与生存率的关系

    下面的代码算出了家眷人数与生存率的关系。第一个for循环是画图需要:遍历分组统计后的各个家庭规模,若某个规模的所有家庭没有人生存,还是要为它补上一行。事实上,bar方法希望每一个家庭规模都有对应的生还人数,但是有5或者8个家眷的家庭都gg了。因此,我们用append方法增加了两行,生还人数记为0。

    # Count passengers for every (SibSp, Survived) combination.
    sibsp_distribution = train.groupby(["SibSp", "Survived"], {'count':gl.aggregate.COUNT()})

    # Survivors (Survived == 1) and casualties (Survived == 0).
    survived = sibsp_distribution.filter_by(1,"Survived")
    died = sibsp_distribution.filter_by(0,"Survived")

    # The stacked bars need a survivor count for every SibSp value; add an
    # explicit zero-count row for each value where nobody survived.
    for sibsp in sibsp_distribution["SibSp"]:
        if not survived.filter_by(sibsp, "SibSp"):
            survived = survived.append(gl.SFrame({'SibSp': [sibsp], 'Survived': [1], 'count':[0]}))

    # Sort only after the zero rows have been appended, so that both frames
    # list the SibSp values in the same order and line up row by row.
    survived = survived.sort("SibSp")
    died = died.sort("SibSp")

    width = 0.5

    # Stacked bars: survivors at the bottom, casualties on top.
    survived_bar = mpl.bar(survived["SibSp"], survived["count"], width, color='green', edgecolor='white', alpha=0.8)
    died_bar = mpl.bar(died["SibSp"], died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)

    mpl.title('Survived by SibSp', fontsize=20, weight='bold')
    mpl.xticks(survived["SibSp"] + width/2., survived["SibSp"], fontsize=16)
    # The x-limits are set once, here; this cell no longer reads the
    # `indexes` array left behind by an earlier cell.
    mpl.xlim(-0.5,9)
    mpl.yticks(np.arange(0,750,50))
    mpl.xlabel("SibSp", fontsize=16)
    mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

    # Write the survival percentage above each stacked bar.
    for sibsp, alive, dead in zip(survived["SibSp"], survived["count"], died["count"]):
        total = alive + dead
        percentage = alive / float(total) * 100
        mpl.text(sibsp + width / 2., total + 10, "%5.2f%%" % percentage, fontsize=20, ha='center')

    mpl.show()

     

     

    由上图可知,有一个配偶的家庭生存率最高,三口之家次之,接下来才是单身狗,而家眷超过三人生存希望渺茫.

    我们看看有没有孩子与生存率的关系

    # Count passengers for every (Parch, Survived) combination.
    parch_distribution = train.groupby(["Parch", "Survived"], {'count':gl.aggregate.COUNT()})

    # Survivors (Survived == 1) and casualties (Survived == 0).
    survived = parch_distribution.filter_by(1,"Survived")
    died = parch_distribution.filter_by(0,"Survived")

    # The stacked bars need a survivor count for every Parch value; add an
    # explicit zero-count row for each value where nobody survived.
    for parch in parch_distribution["Parch"]:
        if not survived.filter_by(parch, "Parch"):
            survived = survived.append(gl.SFrame({'Parch': [parch], 'Survived': [1], 'count':[0]}))

    # Sort after appending so both frames line up row by row.
    survived = survived.sort("Parch")
    died = died.sort("Parch")

    width = 0.5

    # Stacked bars: survivors at the bottom, casualties on top.
    survived_bar = mpl.bar(survived["Parch"], survived["count"], width, color='green', edgecolor='white', alpha=0.8)
    died_bar = mpl.bar(died["Parch"], died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)

    mpl.title('Survived by Parch', fontsize=20, weight='bold')
    mpl.xticks(survived["Parch"] + width/2., survived["Parch"], fontsize=16)
    # The x-limits are set once, here; this cell no longer reads the
    # `indexes` array left behind by an earlier cell.
    mpl.xlim(-0.5,7)
    mpl.yticks(np.arange(0,800,50))
    mpl.xlabel("Parch", fontsize=16)
    mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )

    # Write the survival percentage above each stacked bar.
    for parch, alive, dead in zip(survived["Parch"], survived["count"], died["count"]):
        total = alive + dead
        percentage = alive / float(total) * 100
        mpl.text(parch + width / 2., total + 10, "%5.2f%%" % percentage, fontsize=20, ha='center')

    mpl.show()

     

     

    我们看看船费与生存率的关系(有钱人可能有特权)

    # Horizontal box plot comparing the fares paid by casualties and survivors.
    # NOTE(review): `fare` is assigned but not used in this cell.
    fare = train["Fare"]
    survived = train.filter_by(1,'Survived')["Fare"]
    died = train.filter_by(0,'Survived')["Fare"]
    
    # Order matters: box 1 is 'Died', box 2 is 'Survived' (see the yticks below).
    data_to_plot = [died, survived]
    
    bp = mpl.boxplot(data_to_plot,patch_artist=True, vert=0)
    
    ## change outline color, fill color and linewidth of the boxes
    for box in bp['boxes']:
        # change outline color
        box.set( color='#7570b3', linewidth=2)
        # change fill color
        box.set( facecolor = '#1b9e77' )
    
    ## change color and linewidth of the whiskers
    for whisker in bp['whiskers']:
        whisker.set(color='#7570b3', linewidth=2)
    
    ## change color and linewidth of the caps
    for cap in bp['caps']:
        cap.set(color='#7570b3', linewidth=2)
    
    ## change color and linewidth of the medians
    for median in bp['medians']:
        median.set(color='#b2df8a', linewidth=2)
    
    ## change the style of fliers and their fill
    for flier in bp['fliers']:
        flier.set(marker='o', color='#e7298a', alpha=0.5)
    
    
    # Label the two boxes and set the fare axis ticks and range.
    mpl.yticks([1,2],['Died', 'Survived'], fontsize=20)
    mpl.xticks(np.arange(0,700, 20))
    mpl.xlim(-10,515)
    mpl.title("Survived by Fare", fontsize=20, weight='bold')
    mpl.show()

     

    这个图是反着看的,活下来的人跟死去的人花的船费对比。活下来的人普遍花了较多的船费,中位数在35刀左右。而死去的人花费的中位数才几美刀。(注意有个花500多刀的真·土豪)

    我们看看上船渡口与生存率的关系

    # Build a descriptive label for every passenger's port of embarkation
    # (None for anything other than S, C or Q) and display that column.
    port = train["Embarked"].apply(
                                        lambda el: el + " (S = Southampton)" if el == "S" 
                                            else ( el + " (C = Cherbourg)" if el == "C" 
                                             else (el + " (Q = Queenstown)" if el == "Q" else None))) 
    port.tail(1) # force the lambda to materialize before .show() is processed
    port.show()
    
    # Count passengers for every (Embarked, Survived) combination.
    embarked_distribution = train.groupby(["Embarked", "Survived"], {'count':gl.aggregate.COUNT()}).dropna()
    
    survived = embarked_distribution.filter_by(1,'Survived').sort("Embarked")
    # Skip the first sorted row -- presumably the group whose Embarked value is
    # empty, which has no counterpart in `died`; TODO confirm against the data.
    survived = survived[1:]
    died = embarked_distribution.filter_by(0,'Survived').sort("Embarked")
    
    # One x position per port.
    indexes = np.arange(len(survived["Embarked"]))
    
    width = 0.5 
    
    # Stacked bars: survivors at the bottom, casualties on top.
    survived_bar = mpl.bar(indexes, survived["count"], width, color='green', edgecolor='white', alpha=0.8)
    died_bar = mpl.bar(indexes, died["count"], width, color='red', edgecolor='white', bottom=survived["count"], alpha=0.8)
    mpl.xlim( indexes[0] - 0.5, indexes[-1] + 1)
    
    mpl.title('Survived by Port of Embarkation', fontsize=20)
    # Two-line tick labels: the port code, then its legend on a second line.
    # The line break must be written as the "\n" escape; a string literal
    # cannot span physical lines.
    labels = [
        el + "\n(S = Southampton)" if el == "S"
        else (el + "\n(C = Cherbourg)" if el == "C"
              else el + "\n(Q = Queenstown)")
        for el in survived["Embarked"]
    ]
    mpl.xticks(np.arange(len(survived["Embarked"])) + width/2.,labels, fontsize=16)
    
    
    # Write the survival percentage above each stacked bar.
    for ind in indexes:
        ind = int(ind)  # convert the numpy integer to a plain int before indexing
        x = ind + width / 2.
        y = survived["count"][ind] + died["count"][ind] + 10 
        percentage = survived["count"][ind] / float( survived["count"][ind] + died["count"][ind]) * 100
        mpl.text(x, y, "%5.2f%%" % percentage, fontsize=20, ha='center')
    
    
    mpl.legend( (survived_bar[0], died_bar[0]), ('Survived', 'Died') )
    
    mpl.show()

    所以Cherbourg上船的人存活率巨高……我个人不太明白为什么

    第二步:模型构建

    在Embarked列中有一些缺值,我们补全一下

    # Replace empty Embarked strings with "S"; every other value is kept as is.
    train["Embarked"] = train["Embarked"].apply(lambda x: x if x != '' else "S")
    port_of_embarkation = train["Embarked"]
    # NOTE(review): presumably forces the lazy apply to be evaluated before
    # show() is called -- confirm.
    port_of_embarkation.tail(1)
    port_of_embarkation.show()

    在训练集中再取80%来训练模型,20%来验证模型。

    # Randomly split `train` with fraction 0.8 into a training part and a
    # held-out part; the fixed seed makes the split repeatable.
    train_set, test_set = train.random_split(0.8, seed=4)
    # Python 2 print statements reporting the size of each part.
    print "Rows for training:", train_set.num_rows()
    print "Rows for testing:", test_set.num_rows()

    试一下 gradient boosted tree 这个模型

    # Fit a boosted-trees regression model that predicts Survived from the
    # listed passenger features, using only the training split.
    # NOTE(review): Survived is filtered as 0/1 elsewhere in this notebook, so
    # a classifier may be the more natural model -- confirm the intent.
    model_4 = gl.boosted_trees_regression.create(train_set,target='Survived', 
                                      features=['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Embarked', 'Fare'])
    # Evaluate on the held-out split and print the returned metrics.
    result_4 = model_4.evaluate(test_set)
    
    print result_4
    
    下面是训练过程
    PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
              You can set ``validation_set=None`` to disable validation tracking.
    
    Boosted trees regression:
    --------------------------------------------------------
    Number of examples          : 663
    Number of features          : 7
    Number of unpacked features : 7
    +-----------+--------------+--------------------+----------------------+---------------+-----------------+
    | Iteration | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
    +-----------+--------------+--------------------+----------------------+---------------+-----------------+
    | 1         | 0.094067     | 0.640132           | 0.640625             | 0.413718      | 0.452089        |
    | 2         | 0.095066     | 0.736341           | 0.741699             | 0.361147      | 0.435120        |
    | 3         | 0.097067     | 0.799792           | 0.795940             | 0.326181      | 0.414205        |
    | 4         | 0.098068     | 0.843834           | 0.853179             | 0.300373      | 0.418672        |
    | 5         | 0.099068     | 0.866550           | 0.875894             | 0.284084      | 0.414071        |
    | 6         | 0.100069     | 0.886572           | 0.895917             | 0.268531      | 0.401603        |
    +-----------+--------------+--------------------+----------------------+---------------+-----------------+
    {'max_error': 0.9767722487449646, 'rmse': 0.3790493668897309}

    第三步:导入测试集进行预测

    # Load the Kaggle test set. The package is imported as `gl`, so the bare
    # name `graphlab` is not bound and must not be used here.
    test = gl.SFrame.read_csv('test.csv')
    # Predict a survival score for every test passenger with the trained model.
    model_4.predict(test)
    dtype: float
    Rows: 418
    [0.24605900049209595, 0.1579868197441101, 0.09492728114128113, 0.08076220750808716, 0.820347249507904, 0.13742545247077942, 0.46745458245277405, 0.08334535360336304, 0.6385629177093506, 0.053301453590393066, 0.7933655977249146, 0.10734456777572632, 0.9794546365737915, 0.0696893036365509, 0.9803991913795471, 0.9651352167129517, 0.08926722407341003, 0.32400867342948914, 0.8363758325576782, 0.1579868197441101, 0.4781973361968994, 0.6420668363571167, 0.4161583185195923, 0.28341546654701233, 0.9170076847076416, 0.0696893036365509, 0.9794546365737915, 0.17743894457817078, 0.5841416120529175, 0.7112432718276978, 0.0696893036365509, 0.09834089875221252, 0.7118383646011353, 0.36395323276519775, 0.47720423340797424, 0.2933708429336548, 0.4699748754501343, 0.16753268241882324, 0.0941736102104187, 0.5083406567573547, 0.2918650507926941, 0.7348397970199585, 0.10613331198692322, 0.9710206985473633, 0.9803991913795471, 0.14958679676055908, 0.42003297805786133, 0.5664023756980896, 0.9672679901123047, 0.7332731485366821, 0.5267215967178345, 0.1717779040336609, 0.9495010375976562, 0.9067643880844116, 0.8308284282684326, 0.05739110708236694, 0.08792659640312195, 0.11708483099937439, 0.8308284282684326, 0.9750292301177979, 0.06759494543075562, 0.13685157895088196, 0.10684752464294434, 0.7940642237663269, 0.1582772135734558, 0.7426018714904785, 0.7501979470252991, 0.1021573543548584, 0.2818759083747864, 0.8806270360946655, 0.7940642237663269, 0.06759494543075562, 0.7951251268386841, 0.2818759083747864, 0.9750292301177979, 0.28711044788360596, 0.8174170255661011, 0.9488879442214966, 0.13685157895088196, 0.7940642237663269, 0.893699049949646, 0.04857367277145386, 0.20609065890312195, 0.7933655977249146, 0.6543059349060059, 0.8308284282684326, 0.9047337770462036, 0.16753268241882324, 0.8481748104095459, 0.9108253717422485, 0.5572522878646851, 0.7125066518783569, 0.35652855038642883, 0.8174170255661011, 0.28670477867126465, 0.28864753246307373, 0.9726588726043701, 0.16057392954826355, 
0.70356285572052, 0.1119779646396637, ... ]
  • 相关阅读:
    ASP.NET(C#)图片加文字、图片水印
    CMake构建Visual Studio中MFC项目的Unicode问题
    用Visual Studio 2008(VS)编译WebKit的r63513
    此时学习中
    ASP.NET进阶——初学者的提高(长期)
    继续努力
    程序员阿士顿的故事
    iOS 深拷贝和浅拷贝
    Javascript中this的取值
    Lisp的本质(The Nature of Lisp)
  • 原文地址:https://www.cnblogs.com/earsonlau/p/11360846.html
Copyright © 2011-2022 走看看