zoukankan      html  css  js  c++  java
  • 吴裕雄--天生自然 PYTHON数据分析:威斯康星乳腺癌(诊断)数据分析

    # This Python 3 environment comes with many helpful analytics libraries installed
    # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
    # For example, here are several helpful packages to load
    
    import numpy as np # linear algebra
    import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
    import seaborn as sns # data visualization library  
    import matplotlib.pyplot as plt
    # Input data files are available in the "../input/" directory.
    # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
    import time
    from subprocess import check_output
    # Load the raw table from the Kaggle input directory.
    data = pd.read_csv('../input/data.csv')
    data.head()  # notebook preview; head() returns the first 5 rows by default

    # Column labels of the raw table.
    col = data.columns
    print(col)

    # Split the table into labels (y) and features (x).
    y = data.diagnosis  # class label; per the original note the values are M or B
    # Columns that are not features. Named `drop_cols` rather than `list` so the
    # built-in `list` type is not shadowed for the rest of the script.
    drop_cols = ['Unnamed: 32', 'id', 'diagnosis']
    x = data.drop(drop_cols, axis=1)
    x.head()

    # Bar chart of the class balance (original note: M = 212, B = 357).
    # The series is passed by keyword because recent seaborn releases no longer
    # accept the plotted variable as a positional argument.
    ax = sns.countplot(x=y, label="Count")
    # Look each count up by its label instead of unpacking value_counts():
    # value_counts() orders by frequency, so positional unpacking would swap
    # B and M silently if the class balance were reversed.
    class_counts = y.value_counts()
    B = class_counts.get('B', 0)
    M = class_counts.get('M', 0)
    print('Number of Benign: ',B)
    print('Number of Malignant : ',M)

    # Summary statistics of the raw features (notebook display).
    x.describe()

    # Violin plots of the standardized features, one figure per group of columns.
    data_dia = y
    # z-score standardization so features on different scales share one axis
    data_n_2 = (x - x.mean()) / x.std()
    # Column ranges for the first, second and last feature groups. The final
    # end bound (31) is kept from the original; iloc slices clamp to the
    # columns that actually exist.
    for first, last in ((0, 10), (10, 20), (20, 31)):
        # Put the label next to this group of features, then reshape to long
        # form (one row per sample/feature pair) for seaborn.
        data = pd.concat([y, data_n_2.iloc[:, first:last]], axis=1)
        data = pd.melt(data, id_vars="diagnosis",
                       var_name="features",
                       value_name='value')
        plt.figure(figsize=(10, 10))
        sns.violinplot(x="features", y="value", hue="diagnosis", data=data,
                       split=True, inner="quart")
        plt.xticks(rotation=90)
    # `data` now holds the melted last group; the box plot below reads it.

    # A box plot is an alternative to the violin plot and makes outliers easy
    # to spot. Only one group of features is drawn here as an example: the
    # long-form `data` frame produced by the preceding plotting step. The other
    # groups can be visualized the same way.
    plt.figure(figsize=(10, 10))
    sns.boxplot(
        data=data,
        x="features",
        y="value",
        hue="diagnosis",
    )
    plt.xticks(rotation=90)

    # Joint distribution of two features with a fitted regression line.
    # `kind` must be "reg": the original "regg" was a typo that only old
    # seaborn releases tolerated, and current releases reject unknown kinds.
    # Columns are given by name via `data=` because current seaborn no longer
    # accepts the x/y vectors positionally.
    sns.jointplot(data=x, x='concavity_worst', y='concave points_worst',
                  kind="reg", color="#ce1414")

    # Pairwise view of three "worst" size features: density contours below the
    # diagonal, scatter plots above it, and 1-D density curves on the diagonal.
    sns.set(style="white")
    df = x[['radius_worst', 'perimeter_worst', 'area_worst']]
    g = sns.PairGrid(df, diag_sharey=False)
    g.map_diag(sns.kdeplot, lw=3)
    g.map_upper(plt.scatter)
    g.map_lower(sns.kdeplot, cmap="Blues_d")

    # Swarm plots of the standardized features, one figure per group of
    # columns, with the total wall-clock time of the three plots reported.
    sns.set(style="whitegrid", palette="muted")
    data_dia = y
    # z-score standardization so features on different scales share one axis
    data_n_2 = (x - x.mean()) / x.std()
    tic = time.time()
    # Column ranges for the first, second and last feature groups. The final
    # end bound (31) is kept from the original; iloc slices clamp to the
    # columns that actually exist.
    for first, last in ((0, 10), (10, 20), (20, 31)):
        # Put the label next to this group of features, then reshape to long
        # form (one row per sample/feature pair) for seaborn.
        data = pd.concat([y, data_n_2.iloc[:, first:last]], axis=1)
        data = pd.melt(data, id_vars="diagnosis",
                       var_name="features",
                       value_name='value')
        plt.figure(figsize=(10, 10))
        sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
        plt.xticks(rotation=90)
    toc = time.time()
    print("swarm plot time: ", toc-tic ," s")

    # Correlation map: pairwise correlations of all features, drawn as an
    # annotated heatmap with one decimal place per cell.
    f, ax = plt.subplots(figsize=(18, 18))
    sns.heatmap(
        x.corr(),
        ax=ax,
        annot=True,
        fmt='.1f',
        linewidths=.5,
    )

  • 相关阅读:
    Hibernate_条件查询客户列表
    Hibernate_添加联系人练习
    Linux目录的切换
    CentOS6.5在VMware中安装
    一个关于vue+mysql+express的全栈项目(三)------ 登录注册功能的实现(已经密码安全的设计)
    一个关于vue+mysql+express的全栈项目(二)------ 前端构建
    基于vue实现模糊匹配(这里以邮箱模糊匹配为例,其他的模糊匹配都可以类比)
    一个关于vue+mysql+express的全栈项目(一)
    关于Google浏览器Unable to preventDefault inside passive event listener due to target being treated as passive.的解决方案
    在移动端H5开发中(关于安卓端position:fixed和position:absolute;和虚拟键盘冲突的问题,以及解决方案)
  • 原文地址:https://www.cnblogs.com/tszr/p/11233958.html
Copyright © 2011-2022 走看看