zoukankan      html  css  js  c++  java
  • Python 探索性数据分析(Exploratory Data Analysis,EDA)

    探索性数据分析,主要针对原始数据进行初次了解。了解数据的分布情况、了解分析方向、排除该单个变量的异常值 等。此脚本读取的是 SQL Server ,只需给定表名或视图名称,如果有数据,将输出每个字段符合要求的每张数据分布图。

    # -*- coding: UTF-8 -*-
    # python 3.5.0
    # 探索性数据分析(Exploratory Data Analysis,EDA)
    __author__ = 'HZC'
    
    import math
    import sqlalchemy
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    class EDA:
        def __init__(self,d):
            self.engine = sqlalchemy.create_engine("mssql+pymssql://%s:%s@%s/%s" %(d['user'],d['pwd'],d['ins'],d['db']))
            
        def get_df_from_table(self,table_name):
            df = pd.read_sql_table(table_name, self.engine)
            return df
        
        def get_df_from_query(self,sql):
            df = pd.read_sql_query(sql, self.engine)
            return df
        
        #读取表各字段数据类型
        def get_table_type(self,table_name):
            sql = """select c.name as colname,t.name as typename
            from sys.sysobjects o inner join syscolumns c on o.id=c.id and o.name<>'dtproperties' 
            inner join sys.systypes t on c.xusertype=t.xusertype 
            where o.name='%s'""" % table_name
            df = self.get_df_from_query(sql)
            return df
        
        #绘图
        def eda_plot(self,table_name):
            list_char = ['char','nchar','varchar','nvarchar','text','ntext','sysname']
            list_num = ['tinyint','smallint','int','real','money','float','decimal','numeric','smallmoney','bigint']
            df_type = self.get_table_type(table_name)
            df_date = self.get_df_from_table(table_name)
            date_count = df_date.shape[0]
            k = 0
            for row in df_type.itertuples():
                k = k + 1
                #字符类型,绘柱状图
                if row.typename in list_char:
                    col = df_date.groupby([row.colname]).agg({row.colname:['count']})
                    row_count = col.shape[0]
                    #col_count = col.shape[1]
                    col = col.sort_index()
                    val = col.values.tolist()
                    #只绘不重复数占总数比小于 5% 的
                    if math.floor(row_count*100/date_count) <5:
                        df_ = pd.DataFrame(col.index.values.tolist(), columns=[row.colname])
                        df_['count'] = list(i[0] for i in val)
                        x_axle = range(len(df_[row.colname]))
                        y_axle = df_['count'].tolist()
                        x_label = df_[row.colname].tolist()
                        fig, (ax1, ax2) = plt.subplots(2)
                        ax1.bar(x_axle,y_axle)
                        ax1.set_xticks(x_axle)
                        ax1.set_xticklabels(x_label)
                        ax1.set_title('表[%s]  %s  分布' % (table_name,row.colname))
                        ax2.pie(y_axle,labels=x_label, autopct='%1.2f%%')
                        
                #数值类型,其他分布图    
                elif row.typename in list_num:
                    df__ = pd.DataFrame(df_date[row.colname])
                    df__ = df__[(df__[row.colname].notnull())].sort_values(row.colname, ascending=True).reset_index(drop=True)
                    k = k + 1
                    plt.figure(k)
                    plt.subplot(1,3,1)
                    plt.hist(df__[row.colname])
                    plt.subplot(1,3,2)
                    plt.boxplot(df__[row.colname])
                    plt.gca().set_title('表[%s]  %s  分布' % (table_name,row.colname))
                    plt.subplot(1,3,3)
                    plt.violinplot(df__[row.colname])
                    plt.tight_layout()
                else:
                    pass
            plt.show()
    
    if __name__ == "__main__":
        #conn = {'user':'kk','pwd':'kk','ins':'HYH0109-189\CAT2014','db':'CSMS3'} 
        conn = {'user':'用户名','pwd':'密码','ins':'实例','db':'数据库'} 
        eda = EDA(conn)
        eda.eda_plot("表或视图名")

    显示图分为字符型(离散型)和数值型(连续型),示例结果如下:

     

  • 相关阅读:
    founder面试题
    项目bug的修正
    Linux下分割、合并PDF(pdftk),用于Linux系统的6款最佳PDF页面裁剪工具
    Vim global命令和重复操作
    嵌入式linux GUI--DirectFB + GTK至尊秘笈
    让QT/Embedded支持国际化
    开篇-QT完全手册
    java多线程样例
    Windows Minifilter驱动
    poj 3735 大数量反复操作问题(矩阵高速幂)
  • 原文地址:https://www.cnblogs.com/hzc2012/p/8276350.html
Copyright © 2011-2022 走看看