zoukankan      html  css  js  c++  java
  • python全栈闯关--pandas

    1、导入

    import pandas as pd
    import numpy as np

    2、数据结构

    1、Series

    # Build a Series from a plain list; the printed dtype is float64 because
    # one of the entries is NaN.
    raw_values = [1, 2, 3, 4, 5, np.nan, 6, 7]
    s = pd.Series(raw_values)
    print(s)
    # Printed output:
    # 0    1.0
    # 1    2.0
    # 2    3.0
    # 3    4.0
    # 4    5.0
    # 5    NaN
    # 6    6.0
    # 7    7.0
    # dtype: float64

    2、DataFrame

    # Six consecutive daily timestamps starting 2019-01-01, used as row labels.
    dates = pd.date_range('20190101', periods=6)
    # `index` supplies the row labels, `columns` supplies the column labels.
    df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])

    # Banner for the whole frame; the column-selection banner is printed
    # further down, right before the actual column selection.
    print('DataFrame'.center(50, '-'))
    print(df)
    # Sample output (values are random and differ on every run):
    #                    a         b         c         d
    # 2019-01-01 -1.294464  0.706790 -0.164825 -0.237432
    # 2019-01-02 -1.091822  0.824446  0.748465 -0.191267
    # 2019-01-03 -0.755218  1.637604 -1.896371 -0.093815
    # 2019-01-04 -2.610031 -0.705783 -1.247235 -1.398978
    # 2019-01-05 -0.324550  1.014212  1.375527 -0.409117
    # 2019-01-06 -0.512911  0.301417  1.227190  0.771551

    # Select a single column by its label; the result is a Series that keeps
    # the date index and carries the column label as its name.
    print('列选取'.center(50, '-'))
    print(df['c'])
    # 2019-01-01   -0.164825
    # 2019-01-02    0.748465
    # 2019-01-03   -1.896371
    # 2019-01-04   -1.247235
    # 2019-01-05    1.375527
    # 2019-01-06    1.227190
    # Freq: D, Name: c, dtype: float64

    3、创建特定数据的DataFrame

    # Each entry shows a different way of supplying a column. Scalars are
    # repeated for every row; the list-like entries all have four elements,
    # which gives the frame its four rows.
    column_specs = {
        # Plain scalar: repeated on every row.
        'A': 1,
        # Timestamp scalar: likewise repeated on every row.
        'B': pd.Timestamp('20190930'),
        # Series of a constant; its own index (0..3) provides the row labels.
        'C': pd.Series(2, index=list(range(4)), dtype='float'),
        # Categorical column.
        'D': pd.Categorical([1, 2, 3, 4]),
        # Plain Python lists, one value per row.
        'E': ['a', 'b', 'c', 'd'],
        'F': 'beer',
        'G': [1, 5, 4, 4],
    }
    df_1 = pd.DataFrame(column_specs)
    print(df_1)
    # Printed output:
    #    A          B    C  D  E     F  G
    # 0  1 2019-09-30  2.0  1  a  beer  1
    # 1  1 2019-09-30  2.0  2  b  beer  5
    # 2  1 2019-09-30  2.0  3  c  beer  4
    # 3  1 2019-09-30  2.0  4  d  beer  4

    4、DataFrame常用属性及排序

    # Walk through the basic attributes of df_1 (built in the previous section).
    print('types'.center(50, '-'))
    print(df_1.dtypes)  # one dtype per column
    # A             int64
    # B    datetime64[ns]
    # C           float64
    # D          category
    # E            object
    # F            object
    # G             int64
    # dtype: object
    
    print('index'.center(50, '-'))
    print(df_1.index, type(df_1.index))  # the row labels and their container type
    # Int64Index([0, 1, 2, 3], dtype='int64') <class 'pandas.core.indexes.numeric.Int64Index'>
    # NOTE: Int64Index was removed in pandas 2.0; newer versions report a plain Index here.
    
    print('columns'.center(50, '-'))
    print(df_1.columns, type(df_1.columns))  # the column labels and their container type
    # Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object') <class 'pandas.core.indexes.base.Index'>
    
    print('values'.center(50, '-'))
    print(df_1.values, type(df_1.values))  # the underlying data; the result is a numpy ndarray
    # [[1 Timestamp('2019-09-30 00:00:00') 2.0 1 'a' 'beer' 1]
    #  [1 Timestamp('2019-09-30 00:00:00') 2.0 2 'b' 'beer' 5]
    #  [1 Timestamp('2019-09-30 00:00:00') 2.0 3 'c' 'beer' 4]
    #  [1 Timestamp('2019-09-30 00:00:00') 2.0 4 'd' 'beer' 4]] <class 'numpy.ndarray'>
    
    
    print('describe'.center(50, '-'))
    print(df_1.describe())  # summary statistics of the numeric columns (a DataFrame, not an ndarray)
    # Values below follow from G = [1, 5, 4, 4].
    # NOTE(review): newer pandas versions may additionally summarise the datetime column B.
    #          A    C         G
    # count  4.0  4.0  4.000000  number of non-null values
    # mean   1.0  2.0  3.500000  mean
    # std    0.0  0.0  1.732051  standard deviation (not the variance)
    # min    1.0  2.0  1.000000  minimum
    # 25%    1.0  2.0  3.250000  first quartile
    # 50%    1.0  2.0  4.000000  median (second quartile)
    # 75%    1.0  2.0  4.250000  third quartile
    # max    1.0  2.0  5.000000  maximum
    
    print('数据翻转'.center(50, '-'))
    print(df_1.T)  # transpose: rows become columns and vice versa (still a DataFrame)
    #                      0  ...                    3
    # A                    1  ...                    1
    # B  2019-09-30 00:00:00  ...  2019-09-30 00:00:00
    # C                    2  ...                    2
    # D                    1  ...                    4
    # E                    a  ...                    d
    # F                 beer  ...                 beer
    # G                    1  ...                    4
    #
    # [7 rows x 4 columns]
    
    # Sorting demos on df_1 (built in an earlier section) and on a small integer frame.
    print('数据按照列排序'.center(50, '-'))
    # sort_index: axis=0 orders the rows by their index labels,
    #             axis=1 orders the columns by their column labels.
    # sort_values: axis=0 reorders rows using the values of the column(s) in `by`,
    #              axis=1 reorders columns using the values of the row(s) in `by`.
    # ascending=False gives descending order, ascending=True ascending order.
    print('按照index排序'.center(50, '-'))
    print(df_1.sort_index(axis=0, ascending=False))
    #    A          B    C  D  E     F  G
    # 3  1 2019-09-30  2.0  4  d  beer  4
    # 2  1 2019-09-30  2.0  3  c  beer  4
    # 1  1 2019-09-30  2.0  2  b  beer  5
    # 0  1 2019-09-30  2.0  1  a  beer  1
    
    print('G列排序'.center(50, '-'))
    print(df_1.sort_values(by='G', ascending=False))  # sort rows by the values of one column
    #    A          B    C  D  E     F  G
    # 1  1 2019-09-30  2.0  2  b  beer  5
    # 2  1 2019-09-30  2.0  3  c  beer  4
    # 3  1 2019-09-30  2.0  4  d  beer  4
    # 0  1 2019-09-30  2.0  1  a  beer  1
    
    print('G,D列排序'.center(50, '-'))
    print(df_1.sort_values(by=['G', 'D'], ascending=False))  # sort by G first, ties broken by D
    #    A          B    C  D  E     F  G
    # 1  1 2019-09-30  2.0  2  b  beer  5
    # 3  1 2019-09-30  2.0  4  d  beer  4
    # 2  1 2019-09-30  2.0  3  c  beer  4
    # 0  1 2019-09-30  2.0  1  a  beer  1
    
    print('数据按照行排序'.center(50, '-'))
    index = list(range(4))
    col = ['A', 'B', "C", 'D', 'E']
    # 4x5 frame holding 0..19 row by row, so every row increases from A to E.
    d_sort = pd.DataFrame(np.arange(20).reshape(4, 5), index=index, columns=col)
    # print(d_sort)
    print(d_sort.sort_values(by=[1, 2], axis=1, ascending=False))  # reorder columns by the values in rows 1 and 2
    #     E   D   C   B   A
    # 0   4   3   2   1   0
    # 1   9   8   7   6   5
    # 2  14  13  12  11  10
    # 3  19  18  17  16  15

    5、选择数据

    # Selection demos on a 10x5 frame of random numbers indexed by date.
    index = ['A', 'B', "C", 'D', 'E']  # despite the name, these are the column labels
    dates = pd.date_range('20191001', periods=10)
    df = pd.DataFrame(np.random.randn(10, 5), index=dates, columns=index)
    # print(df)
    # All sample outputs below are from one random run unless noted otherwise.

    print('选择某列'.center(50, '-'))
    print(df['A'])  # a single column, returned as a Series
    # 2019-10-01   -0.595401
    # 2019-10-02    1.264714
    # 2019-10-03    1.179423
    # 2019-10-04   -0.516471
    # 2019-10-05    0.891850
    # 2019-10-06   -0.011205
    # 2019-10-07   -0.206089
    # 2019-10-08    0.972745
    # 2019-10-09   -0.135309
    # 2019-10-10    1.590818
    # Freq: D, Name: A, dtype: float64

    print('切片选择'.center(50, '-'))
    print(df[0:3])  # slice rows by position
    print(df['2019-10-02':'2019-10-05'])  # slice rows by index label
    #                    A         B         C         D         E
    # 2019-10-01 -0.595401  0.337930  0.034220  1.472752 -0.555414
    # 2019-10-02  1.264714  0.518856 -1.148349  1.674159 -0.473919
    # 2019-10-03  1.179423  2.036095 -0.719042  1.607909  2.659472
    #                    A         B         C         D         E
    # 2019-10-02  1.264714  0.518856 -1.148349  1.674159 -0.473919
    # 2019-10-03  1.179423  2.036095 -0.719042  1.607909  2.659472
    # 2019-10-04 -0.516471  1.733509 -0.177231  0.260795 -0.106666
    # 2019-10-05  0.891850  0.665301  0.013627 -1.346193  0.222099

    # The positional slice [0:3] returns rows 0-2 (the end is excluded).
    # The label slice '2019-10-02':'2019-10-05' returns the full date range,
    # including both end points.

    print('按照行精确选择'.center(50, '-'))
    print(df.loc['2019-10-02', ['A', 'B']])  # one row by label, restricted to the listed columns
    # A    1.264714
    # B    0.518856
    # Name: 2019-10-02 00:00:00, dtype: float64

    print('行号选择数据'.center(50, '-'))
    print(df.iloc[3, 1])  # 1.7335085248615345
    # Positions are counted from 0, so this is the 4th row, 2nd column.
    print(df.iloc[3:5, 0:2])  # the 4th and 5th rows, the 1st and 2nd columns
    #                    A         B
    # 2019-10-04 -0.516471  1.733509
    # 2019-10-05  0.891850  0.665301
    # Positional slices start at 0 and are half-open: the start is included,
    # the end is not.

    print('混合选择'.center(50, '-'))
    # Mix positional rows with labelled columns. DataFrame.ix no longer exists
    # in pandas, so the row positions are turned into labels and passed to .loc.
    print(df.loc[df.index[0:3], ['B', 'C']])
    #                    B         C
    # 2019-10-01  0.337930  0.034220
    # 2019-10-02  0.518856 -1.148349
    # 2019-10-03  2.036095 -0.719042

    print('条件选择'.center(50, '-'))
    print(df[df.A > 0])  # keep only the rows whose value in column A is positive
    # Sample output (taken from a different random run than the outputs above):
    #                    A         B         C         D         E
    # 2019-10-01  0.391314  0.647378  0.065032 -0.436882 -0.482698
    # 2019-10-02  1.742555  0.374014  0.737914  1.708461  0.328336
    # 2019-10-03  0.024506 -0.455824 -0.397145  1.523103  1.361226
    # 2019-10-04  0.140041 -0.604164 -0.397656 -0.423711 -0.626598
    # 2019-10-05  0.027898  0.159293 -1.000558  0.921370 -1.613052
    # 2019-10-08  1.411249 -1.292006  0.140944  0.699647 -0.065080
    # 2019-10-10  0.306495  0.590515 -0.524972  0.521179 -0.805736
  • 相关阅读:
    clean code
    jenkins
    获取目录下的文件名称
    bootstrap-select 下拉互斥
    supervisord
    正则表达式
    Docker
    git
    goland工具
    小程序 swiper 轮播图滚动图片 + 视频
  • 原文地址:https://www.cnblogs.com/zxw-xxcsl/p/11662923.html
Copyright © 2011-2022 走看看