zoukankan      html  css  js  c++  java
  • 数据特征分析-分布分析

    分布分析用于研究数据的分布特征,常用分析方法:

    1、极差

    2、频率分布

    3、分组组距及组数

    # Sample housing listings used by every analysis step below (15 rows).
    # Columns: 编码 = listing code, 小区 = residential community, 朝向 = orientation,
    # 单价 = unit price, 首付 = down payment, 总价 = total price, 经度 = longitude, 纬度 = latitude.
    # NOTE: the fifth orientation was spelled 'eath_south'; it is corrected to 'east_south'
    # so the orientation frequency counts do not split one category into two.
    df = pd.DataFrame({
        '编码': ['%03d' % code for code in range(1, 16)],
        '小区': ['A村', 'B村', 'C村', 'D村', 'E村'] * 3,
        '朝向': [
            'south', 'east_north', 'south', 'east_south', 'east_south',
            'north', 'east_north', 'west_north', 'south', 'west',
            'north', 'east_north', 'south', 'south', 'east',
        ],
        '单价': [7374, 6435, 6643, 5874, 6738, 6453, 5733, 6034, 5276, 5999, 6438, 5864, 6099, 5699, 6999],
        '首付': [15, 7.5, 18, 10, 30, 10, 18, 30, 40, 30, 20, 22, 29, 30, 40],
        '总价': [50, 65, 68, 73, 80, 55, 45, 70, 59, 57, 40, 60, 50, 48, 60],
        '经度': [114.0, 114.6, 114.8, 114.2, 114.5, 114.3, 114.4, 114.7, 114.9, 114.1, 114.8, 114.2, 114.5, 114.3, 114.8],
        '纬度': [22.0, 22.4, 22.6, 22.8, 22.2, 22.1, 22.7, 22.5, 22.9, 22.3, 22.8, 22.2, 22.1, 22.7, 22.5],
    })

    先以经度、纬度为坐标,对全部数据做散点图

    # Marker size encodes the unit price (larger = higher unit price);
    # colour depth encodes the total price (darker = higher total price).
    plt.scatter(
        x=df['经度'],
        y=df['纬度'],
        s=df['单价'] / 50,
        c=df['总价'],
        cmap='Greens',
    )

     求总价、单价和首付的极差

    def d_range(df, *cols):
        """Return a text report of the range (max - min) of each given column.

        Args:
            df: DataFrame holding the columns to summarise.
            *cols: Names of columns in ``df``; any number may be passed.

        Returns:
            A string with one ``<column>极差:<range>`` line per column, in the
            order given, separated by newlines. Empty when no column is passed.
        """
        # One report line per column; joining with '\n' keeps the output format
        # while supporting any number of columns.
        return '\n'.join(
            '%s极差:%s' % (col, df[col].max() - df[col].min()) for col in cols
        )
    # Report the range of total price, unit price and down payment.
    range_columns = ('总价', '单价', '首付')
    print(d_range(df, *range_columns))
    # Expected output:
    # 总价极差:40
    # 单价极差:2098
    # 首付极差:32.5

    单价和总价的频率分布

    # Side-by-side histograms (8 bins each): unit price on the left, total price on the right.
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    for axis, column in zip(axes, ('单价', '总价')):
        df[column].hist(bins=8, ax=axis)

    将总价分为8个区间,求出每个区间的频数、频率,并求出累计频率

    # Frequency distribution of the total price over grouped intervals.
    total_range = pd.cut(df['总价'], bins=8)  # assign every total price to one of 8 intervals
    total_range_count = total_range.value_counts(sort=False)  # rows per interval, not sorted by count
    total_range_s = total_range_count.to_frame(name='频数')  # summary table with a single count column
    df['区间'] = total_range.values  # tag each original row with its interval
    count_total = total_range_s['频数'].sum()
    total_range_s['频率'] = total_range_s['频数'] / count_total  # share of rows in each interval
    total_range_s['累计频率'] = total_range_s['频率'].cumsum()  # running total of the shares
    # Add display copies of both ratio columns, formatted as percentages with two decimals.
    for ratio_column in ('频率', '累计频率'):
        total_range_s[ratio_column + '%'] = total_range_s[ratio_column].apply(
            lambda x: '%.2f%%' % (100 * x)
        )
    # Styled table with in-cell bars for the two numeric ratio columns.
    total_range_s.style.bar(subset=['频率', '累计频率'])

     对每个总价区间出现的频率做柱状图

    # Bar chart of the frequency of each total-price interval.
    total_range_s['频率'].plot(kind='bar', alpha=0.8, title='total price interval')
    # Write each interval's count slightly above the top of its bar.
    for position, (frequency, count) in enumerate(
        zip(total_range_s['频率'], total_range_s['频数'])
    ):
        plt.text(position, frequency + 0.01, count)

     

    对于定性字段(如朝向),做频率统计分析

    # Frequency distribution of a qualitative field: orientation.
    cx = df['朝向'].value_counts()  # rows per orientation value
    cx_s = cx.to_frame(name='频数')  # summary table with a single count column
    orientation_total = cx_s['频数'].sum()
    cx_s['频率'] = cx_s['频数'] / orientation_total  # share of rows per orientation
    cx_s['累计频率'] = cx_s['频率'].cumsum()  # running total of the shares
    # Add display copies of both ratio columns, formatted as percentages with two decimals.
    for ratio_column in ('频率', '累计频率'):
        cx_s[ratio_column + '%'] = cx_s[ratio_column].apply(
            lambda x: '%.2f%%' % (100 * x)
        )
    # Styled table with in-cell bars for the two numeric ratio columns.
    cx_s.style.bar(subset=['频率', '累计频率'])

     对朝向做柱状图和饼图

    # Orientation frequencies as a bar chart (left) and orientation counts as a pie chart (right).
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    bar_ax, pie_ax = axes
    cx_s['频率'].plot(kind='bar', ax=bar_ax, title='direction bar')
    # Target the second subplot explicitly instead of relying on pyplot's implicit
    # "current axes", so the pie always lands in the right-hand panel.
    pie_ax.pie(cx_s['频数'], labels=cx_s.index, autopct='%2.f%%')
    pie_ax.set_title('direction pie')

  • 相关阅读:
    HDU 2236 无题Ⅱ
    Golden Tiger Claw(二分图)
    HDU 5969 最大的位或 (思维,贪心)
    HDU 3686 Traffic Real Time Query System (图论)
    SCOI 2016 萌萌哒
    Spring Boot支持控制台Banner定制
    构建第一个Spring Boot程序
    Spring Boot重要模块
    Java fastjson JSON和String互相转换
    BCompare 4 Windows激活方法【试用期30天重置】
  • 原文地址:https://www.cnblogs.com/Forever77/p/11344050.html
Copyright © 2011-2022 走看看