zoukankan      html  css  js  c++  java
  • pandas习题100道

    #1
    import pandas as pd
    #2
    pd.__version__
    
    '1.0.5'
    
    #3
    pd.show_versions()
    
    INSTALLED VERSIONS
    ------------------
    commit           : None
    python           : 3.8.3.final.0
    python-bits      : 64
    OS               : Darwin
    OS-release       : 19.6.0
    machine          : x86_64
    processor        : i386
    byteorder        : little
    LC_ALL           : None
    LANG             : zh_CN.UTF-8
    LOCALE           : zh_CN.UTF-8
    
    pandas           : 1.0.5
    numpy            : 1.18.5
    pytz             : 2020.1
    dateutil         : 2.8.1
    pip              : 20.1.1
    setuptools       : 49.2.0.post20200714
    Cython           : 0.29.21
    pytest           : 5.4.3
    hypothesis       : None
    sphinx           : 3.1.2
    blosc            : None
    feather          : None
    xlsxwriter       : 1.2.9
    lxml.etree       : 4.5.2
    html5lib         : 1.1
    pymysql          : 0.10.1
    psycopg2         : None
    jinja2           : 2.11.2
    IPython          : 7.16.1
    pandas_datareader: None
    bs4              : 4.9.1
    bottleneck       : 1.3.2
    fastparquet      : None
    gcsfs            : None
    lxml.etree       : 4.5.2
    matplotlib       : 3.2.2
    numexpr          : 2.7.1
    odfpy            : None
    openpyxl         : 3.0.4
    pandas_gbq       : None
    pyarrow          : None
    pytables         : None
    pytest           : 5.4.3
    pyxlsb           : None
    s3fs             : None
    scipy            : 1.5.0
    sqlalchemy       : 1.3.18
    tables           : 3.6.1
    tabulate         : None
    xarray           : None
    xlrd             : 1.2.0
    xlwt             : 1.3.0
    xlsxwriter       : 1.2.9
    numba            : 0.50.1
    
    #4创建一个DataFrame(df),用data做数据,labels做行索引
    import numpy as np
    data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
          'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
          'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
          'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
    labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
    
    df = pd.DataFrame(data,index=labels)
    df
    
    animal age visits priority
    a cat 2.5 1 yes
    b cat 3.0 3 yes
    c snake 0.5 2 no
    d dog NaN 3 yes
    e dog 5.0 2 no
    f cat 2.0 3 no
    g snake 4.5 1 no
    h cat NaN 1 yes
    i dog 7.0 2 no
    j dog 3.0 1 no
    #5显示有关此df及其数据的基本信息的摘要
    df.info()
    df.describe()
    
    <class 'pandas.core.frame.DataFrame'>
    Index: 10 entries, a to j
    Data columns (total 4 columns):
     #   Column    Non-Null Count  Dtype  
    ---  ------    --------------  -----  
     0   animal    10 non-null     object 
     1   age       8 non-null      float64
     2   visits    10 non-null     int64  
     3   priority  10 non-null     object 
    dtypes: float64(1), int64(1), object(2)
    memory usage: 400.0+ bytes
    
    age visits
    count 8.000000 10.000000
    mean 3.437500 1.900000
    std 2.007797 0.875595
    min 0.500000 1.000000
    25% 2.375000 1.000000
    50% 3.000000 2.000000
    75% 4.625000 2.750000
    max 7.000000 3.000000
    #6查看此df的前三行数据
    df.head(3)
    
    animal age visits priority
    a cat 2.5 1 yes
    b cat 3.0 3 yes
    c snake 0.5 2 no
    #7选择df中列标签为animal和age的数据
    df1 = df[['animal','age']]
    df1
    
    animal age
    a cat 2.5
    b cat 3.0
    c snake 0.5
    d dog NaN
    e dog 5.0
    f cat 2.0
    g snake 4.5
    h cat NaN
    i dog 7.0
    j dog 3.0
    #8选择行为[3, 4, 8],且列为['animal', 'age']中的数据
    #本题的难点在于当行索引被命名为非数字时不能再使用loc按照数字取值,如果使用iloc就不能用名称取列,而本题的要求是按数字取行,按名称取列
    #解法一:分两次取值
    df2 = df.iloc[[3,4,8],:][['animal', 'age']]
    #解法二:取行索引然后按索引取
    df2 = df.loc[df.index[[3,4,8]],['animal', 'age']]
    #错误用法:
    # df2 = df.loc[[3,4,8],['animal', 'age']]
    df2
    
    animal age
    d dog NaN
    e dog 5.0
    i dog 7.0
    #9选择visits大于2的动物种类
    df
    bool_s=df.visits>2
    df[bool_s]['animal']
    
    b    cat
    d    dog
    f    cat
    Name: animal, dtype: object
    
    #10选择age为缺失值的行
    df[df.age.isnull()]
    df[df['age'].isnull()]
    
    animal age visits priority
    d dog NaN 3 yes
    h cat NaN 1 yes
    #11选择animal为cat,且age小于3的行
    df[(df['animal']=='cat') & (df['age']<3)]
    
    animal age visits priority
    a cat 2.5 1 yes
    f cat 2.0 3 no
    #12选择age在2到4之间的数据(包含边界值)
    df[(df['age']>=2) &(df['age']<=4)]
    
    animal age visits priority
    a cat 2.5 1 yes
    b cat 3.0 3 yes
    f cat 2.0 3 no
    j dog 3.0 1 no
    #13将f行的age改为1.5
    # df.loc['f']['age']=1.5 这是链式赋值:会触发SettingWithCopyWarning,而且修改的可能只是临时副本,df本身不一定被改动
    df.loc['f','age']=1.5
    
    #14计算visits列的数据总和
    res = df['visits'].sum()
    res
    
    19
    
    #15计算每种animal的平均age
    df.groupby('animal')['age'].mean()
    
    animal
    cat      2.333333
    dog      5.000000
    snake    2.500000
    Name: age, dtype: float64
    
    #16追加一行(k),列的数据自定义,然后再删除新追加的k行
    
    df.loc['k']=df.loc['a'].values
    df.drop('k',inplace=True) #删除行
    # del df['k'] #删除列
    df
    
    animal age visits priority
    a cat 2.5 1 yes
    b cat 3.0 3 yes
    c snake 0.5 2 no
    d dog NaN 3 yes
    e dog 5.0 2 no
    f cat 1.5 3 no
    g snake 4.5 1 no
    h cat NaN 1 yes
    i dog 7.0 2 no
    j dog 3.0 1 no
    #17计算每种animal的个数(cat有几个,dog几个...)
    df.groupby('animal').size()
    df['animal'].value_counts()
    # df['animal'].unique() #查看动物的种类
    # df['animal'].nunique() #查看动物种类的个数
    
    dog      4
    cat      4
    snake    2
    Name: animal, dtype: int64
    
    #18先根据age降序排列,再根据visits升序排列
    df.sort_values(by=['age','visits'],ascending=[False,True])
    
    animal age visits priority
    i dog 7.0 2 no
    e dog 5.0 2 no
    g snake 4.5 1 no
    j dog 3.0 1 no
    b cat 3.0 3 yes
    a cat 2.5 1 yes
    f cat 1.5 3 no
    c snake 0.5 2 no
    h cat NaN 1 yes
    d dog NaN 3 yes
    #19将priority列的yes和no用True和False替换
    #注意:不要先astype(bool)——非空字符串都会被转成True,之后map在字典里找不到'yes'/'no',整列就会变成NaN;直接对原始字符串做map即可
    df['priority'] = df['priority'].map({'yes': True, 'no': False})
    df
    
    animal age visits priority
    a cat 2.5 1 True
    b cat 3.0 3 True
    c snake 0.5 2 False
    d dog NaN 3 True
    e dog 5.0 2 False
    f cat 1.5 3 False
    g snake 4.5 1 False
    h cat NaN 1 True
    i dog 7.0 2 False
    j dog 3.0 1 False
    #20 将animal列的snake用python替换
    df['animal'].replace('snake','python',inplace=True)
    df
    
    animal age visits priority
    a cat 2.5 1 True
    b cat 3.0 3 True
    c python 0.5 2 False
    d dog NaN 3 True
    e dog 5.0 2 False
    f cat 1.5 3 False
    g python 4.5 1 False
    h cat NaN 1 True
    i dog 7.0 2 False
    j dog 3.0 1 False
    #21对于每种动物类型和每种访问次数,求出平均年龄。换句话说,每一行都是动物,每一列都是访问次数,其值是平均年龄(提示:使用数据透视表)
    # df.groupby(['animal','visits'])['age'].mean().reset_index()
    df.pivot_table(index='animal', columns='visits', values='age', aggfunc='mean')
    
    visits 1 2 3
    animal
    cat 2.5 NaN 2.25
    dog 3.0 6.0 NaN
    python 4.5 0.5 NaN
  • 相关阅读:
    skymvc文件上传支持多文件上传
    skymvc网站测试之mysql数据生成
    欢迎使用skymvc框架,简单易用的php框架
    模式识别与机器学习(第四章学习记录和心得)
    python实现MICD分类器
    python实现MED分类器
    模式识别与机器学习(第一至三章学习记录和心得)
    软工实践个人总结
    第09组 每周小结 (3/3)
    第09组 每周小结 (2/3)
  • 原文地址:https://www.cnblogs.com/Franciszw/p/14470400.html
Copyright © 2011-2022 走看看