zoukankan      html  css  js  c++  java
  • 数据分析

    基本统计:

    # Basic descriptive statistics for the `score` column of a CSV file.
    from pandas import read_csv

    # Raw string: the backslashes in the Windows path must never be parsed
    # as escape sequences (`\P`, `\8` and `\d` are invalid escapes).
    data = read_csv(r'D:\PA\8.1\data.csv')

    # Summary table (count, mean, std, min, quartiles, max).
    data.score.describe()

    # `size` is a property, so it is read without parentheses.
    data.score.size

    # Everything below is a method: it has to be *called* to produce a
    # number; without the parentheses the expression is only the bound method.
    data.score.max()

    data.score.min()

    data.score.sum()

    data.score.mean()

    data.score.var()

    data.score.std()
    View Code

    分组分析:

    # Group analysis: aggregate `score` per class, and per (class, name).
    import numpy
    from pandas import read_csv

    # Raw string keeps the Windows path backslashes literal.
    data = read_csv(r'C:\PA\8.2\data.csv')
    data['score2'] = data['score'] * 2

    # Named aggregation (keyword = output column label). Passing a
    # {label: func} dict to a single-column groupby was removed in
    # pandas 1.0 and raises SpecificationError.
    # String names select pandas' own groupby reductions, so 'var' and 'std'
    # are the sample statistics (ddof=1) regardless of numpy's defaults.
    data.groupby(by=['class'])['score'].agg(
        总分='sum',
        人数=numpy.size,
        平均值='mean',
        方差='var',
        标准差='std',
    )

    # A list of functions applied to several columns yields two-level
    # columns: (source column, function name).
    data.groupby(by=['class', 'name'])[['score', 'score2']].agg([
        numpy.size,
        'sum',
    ])

    # Same per-class aggregation, kept in a variable for inspection.
    result = data.groupby(by=['class'])['score'].agg(
        总分='sum',
        人数=numpy.size,
        平均值='mean',
        方差='var',
        标准差='std',
    )

    result.index
    result.columns
    result['平均值']

    # Same per-(class, name) aggregation, kept in a variable for inspection.
    result2 = data.groupby(by=['class', 'name'])[['score', 'score2']].agg([
        numpy.size,
        'sum',
    ])

    result2.index
    result2.columns
    # First level selects the source column, second the aggregate.
    result2['score']
    result2['score']['sum']

    # Turn the group keys back into ordinary columns.
    result.reset_index()
    result2.reset_index()
    View Code

    分布分析:

    # Distribution analysis: bucket ages into bands and count rows per band.
    import numpy
    import pandas
    from pandas import read_csv

    # Raw string keeps the Windows path backslashes literal.
    data = read_csv(r'D:\PA\8.3\data.csv')

    # Outer edges are pushed one past the observed min/max so every row falls
    # inside a bin (pandas.cut intervals are open on the left by default).
    # NOTE(review): the edges are only increasing if the data has ages below
    # 21 and above 39 -- confirm against the input file.
    bins = [data.年龄.min() - 1, 20, 30, 40, data.年龄.max() + 1]
    labels = ['20岁以及以下', '21岁到30岁', '31岁到40岁', '41岁以上']

    年龄分层 = pandas.cut(data.年龄, bins, labels=labels)

    data['年龄分层'] = 年龄分层

    # Named aggregation replaces the {label: func} dict form, which pandas 1.0
    # removed for single-column groupbys. observed=False keeps age bands
    # that contain no rows in the result.
    data.groupby(by=['年龄分层'], observed=False)['年龄'].agg(人数=numpy.size)
    View Code 

    交叉分析:

    # Cross analysis: age statistics broken down by age band (rows) and
    # gender (columns).
    import numpy
    import pandas
    from pandas import read_csv

    # Raw string keeps the Windows path backslashes literal.
    df = read_csv(r'D:\PA\8.4\data.csv')

    # Outer edges sit one past the observed min/max so every row is binned.
    bins = [df.年龄.min() - 1, 20, 30, 40, df.年龄.max() + 1]
    labels = ['20岁以及以下', '21岁到30岁', '31岁到40岁', '41岁以上']

    年龄分层 = pandas.cut(df.年龄, bins, labels=labels)
    df['年龄分层'] = 年龄分层

    # Count and mean of age per (age band, gender). 'mean' is given by name so
    # pandas uses its own reduction instead of dispatching on a numpy callable.
    r1 = df.pivot_table(
        values=['年龄'],
        index=['年龄分层'],
        columns=['性别'],
        aggfunc=[numpy.size, 'mean'],
    )

    # Standard deviation of age per (age band, gender).
    r2 = df.pivot_table(
        values=['年龄'],
        index=['年龄分层'],
        columns=['性别'],
        aggfunc=['std'],
    )

    # Both tables share the age-band index, so join puts them side by side.
    r1.join(r2)
    View Code

    结构分析:

    # Structure analysis: each brand's / province's share of total spend.
    import numpy
    from pandas import read_csv

    # Raw string keeps the Windows path backslashes literal.
    data = read_csv(r'C:\PA\8.5\data.csv')

    # Total monthly spend per province (rows) and carrier brand (columns).
    # 'sum' is given by name so pandas uses its own reduction.
    data_pt = data.pivot_table(
        values=['月消费(元)'],
        index=['省份'],
        columns=['通信品牌'],
        aggfunc=['sum'],
    )

    # Column totals (axis=0 is the default, so these two are equivalent).
    data_pt.sum()

    data_pt.sum(axis=0)

    # Row totals.
    data_pt.sum(axis=1)

    # Divide each row by its row total: share of every brand within a province.
    data_pt.div(data_pt.sum(axis=1), axis=0)

    # Divide each column by its column total: share of every province
    # within a brand.
    data_pt.div(data_pt.sum(axis=0), axis=1)
    View Code

    相关分析:

    # -*- coding: utf-8 -*-
    # Correlation analysis between columns of a CSV file.
    from pandas import read_csv

    # Raw string keeps the Windows path backslashes literal.
    data = read_csv(r'D:\PA\8.6\data.csv')

    # First, the correlation between two individual columns.
    data['人口'].corr(data['文盲率'])

    # Correlation across several columns at once.
    # Select the columns with:
    # data.loc[:, ['col1', 'col2', '...', 'coln']]
    # DataFrame.corr() then returns the pairwise correlation matrix.
    data.loc[:, ['超市购物率', '网上购物率', '文盲率', '人口']].corr()
    View Code
  • 相关阅读:
    自考新教材-p173_3(1)
    自考新教材-p148_5(2)
    自考新教材-p148_5(1)
    自考新教材-p148_4
    自考新教材-p147_3
    自考新教材-p146_4(2)
    python 模块 chardet报错解决方法:下载及介绍
    第 52 讲:论一只爬虫的自我修养
    第 51 讲: _name_属性
    Python 培训第一讲
  • 原文地址:https://www.cnblogs.com/qiuyuyu/p/9146423.html
Copyright © 2011-2022 走看看