zoukankan      html  css  js  c++  java
  • Pandas-数据分析包

    pandas是python的第三方库所以使用前需要安装一下,直接使用pip install pandas 就会自动安装pandas以及相关组件。

    1、Series模块

    1.1 Series的索引默认是从 0 开始的整数。

    from pandas import Series
    import pandas as pd
    
    s = Series([1,4,'ww','tt'])
    s.index                           # RangeIndex(start=0, stop=4, step=1)
    s.values                          # array([1, 4, 'ww', 'tt'], dtype=object)  
    s
    0     1
    1     4
    2    ww
    3    tt
    dtype: object

    1.2 Series 可以自定义索引:

    s2 = Series(['wangxing','man',24],index=['name','sex','age'])
    s2
    name    wangxing
    sex          man
    age           24
    dtype: object

    1.3 Series值引用

    s2['name']     # 'wangxing'
    s2['sex']      # 'man'
    s2['age']      # 24

    1.4 传入字典,定义Series的索引与值

    # 传入字典,定义Series的索引与值
    sd = {'python':9000,'c++':9001,'c#':9000}
    # s3 = Series(sd)
    s3 = Series({'python':9000,'c++':9001,'c#':9000})
    s3
    python    9000
    c++       9001
    c#        9000
    dtype: int64

    1.5 索引“自动对齐”:如果自定义了索引,自定的索引会自动寻找原来的索引,如果一样的,就取原来索引对应的值

       如果没有值(null),都对齐赋给 NaN

    sd = {'python':9000,'c++':9001,'c#':9000}
    # 如果没有值(null),都对齐赋给 NaN
    s4 = Series(sd, index=['java','c++','c#'])
    s4
    java       NaN
    c++     9001.0
    c#      9000.0
    dtype: float64

    1.6 Pandas 有专门的方法来判断值是否为空   Series 对象也有同样的方法

    #pd.isnull(s4)
    s4.isnull()
    java     True
    c++     False
    c#      False
    dtype: bool

    2、DataFrame

    from pandas import Series,DataFrame
    data = {"name":['google','baidu','yahoo'],"marks":[100,200,300],"price":[1,2,3]}
    f1 = DataFrame(data)
    # 按照惯例默认索引就是从 0 开始的整数
    f1

    # DataFrame 中,columns 其顺序可以被规定
    # 且 DataFrame 数据的索引也能够自定义
    f2 = DataFrame(data,columns=['name','price','marks'], index=['a','b','c'])
    
    f2

     2.1 pandas.read_csv() 从文件中读取数据,生成DataFrame

    import pandas
    # 从文件中读取数据,生成DataFrame
    # Raw string literal: the Windows backslashes must stay literal, otherwise
    # "\2" is read as an octal escape and "\f" as a form feed, corrupting the path.
    food_info=pandas.read_csv(r"G:\python\库应用(4个)\2-数据分析处理库pandas\food_info.csv")

    2.1.1 pd.to_datetime() 将int、float、str、datetime类型等数据转换为datetime

    import pandas as pd
    unrate = pd.read_csv('unrate.csv')
    # pd.to_datetime() 将int、float、str、datetime类型等数据转换为datetime
    unrate['DATE'] = pd.to_datetime(unrate['DATE'])
    help(pd.to_datetime)
    #unrate.head(12)

    2.2 df.head(2) 头部2行的数据

    # 打印头1行数据
    food_info.head(1)

    2.3 food_info.columns获取DataFrame的列名

    # 获取DataFrame的所有列名
    col_names = food_info.columns.tolist()
    col_names

    2.4 访问"Iron_(mg)"列的第[6]位数据    /   访问"Iron_(mg)"列的[2,6,8]位数据

    # 访问"Iron_(mg)"列的第[6]位数据
    food_info["Iron_(mg)"][6]
    # 访问"Iron_(mg)"列的[2,6,8]位数据
    food_info["Iron_(mg)"][[2,6,8]]

    2.5 DataFrame的加、减、乘、除

    # DataFrame的加、减、乘、除
    # food_info["Iron_(mg)"] / 1000
    div_1000 = food_info["Iron_(mg)"] / 1000
    add_100 = food_info["Iron_(mg)"] + 100
    sub_100 = food_info["Iron_(mg)"] - 100
    mult_2 = food_info["Iron_(mg)"] * 2
    
    water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
    # 通过蛋白质和脂肪,计算食物的评分,评分公式:Score=2×(Protein_(g))−0.75×(Lipid_Tot_(g))
    weighted_protein = food_info["Protein_(g)"] * 2
    weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
    initial_rating = weighted_protein + weighted_fat
    initial_rating
    #“Vit_A_IU”列的范围从0到100000,而“Fiber_TD_(g)”列的范围从0到79
    #对于某些计算,因为值的规模,像“Vit_A_IU”这样的列对结果有更大的影响
    # "Energ_Kcal"列的最大值
    max_calories = food_info["Energ_Kcal"].max()   # 902
    
    # 标准化:"Energ_Kcal"列除以该列最大值
    normalized_calories = food_info["Energ_Kcal"] / max_calories
    normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
    normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
    
    # 新增特征:“Normalized_Protein”和“Normalized_Fat”
    food_info["Normalized_Protein"] = normalized_protein
    food_info["Normalized_Fat"] = normalized_fat

    2.6 DataFrame新增和修改列

    # DataFrame增加新column
    food_info["water_energy"] = water_energy
    # Iron单位mg转换为g
    iron_grams = food_info["Iron_(mg)"] / 1000
    # 增加列"Iron_(g)"
    food_info["Iron_(g)"] = iron_grams
    food_info["Iron_(g)"]

    2.7 df.sort_values()排序

    # 默认情况下,pandas将按我们指定的列升序排列数据,并返回一个新的DataFrame
    # inplace参数,是否自排序DataFrame,而不返回新DataFrame
    # inplace=False,非自排序,返回新DataFrame
    # a = food_info.sort_values("Sodium_(mg)",inplace=False)
    # inplace=True,自排序,不返回新DataFrame
    food_info.sort_values("Sodium_(mg)",inplace=True)
    # ascending=False,升序等于False,即降序
    food_info.sort_values("Sodium_(mg)",inplace=True,ascending=False)  # 降序

     2.8 type()  查看数据类型

    type(food_info)          # pandas.core.frame.DataFrame
    print(type(food_info))   # <class 'pandas.core.frame.DataFrame'>
    #print (food_info.dtypes) # 所有columns的数据类型

    2.9 DataFrame行、列访问

    # df.loc[]  DataFrame行访问
    # df.loc[3:6]  访问df的[3-6行]
    food_info.loc[3:6]
    # df.loc[[3,7,9]]  访问列表[]中的行:3,7,9行
    food_info.loc[[3,7,9]]
    # df[["Zinc_(mg)", "Copper_(mg)"]]列访问
    food_info[["Zinc_(mg)", "Copper_(mg)"]]
    # 访问列表[]中的行:3,7,9行,["Shrt_Desc","Water_(g)","Energ_Kcal"]列
    #food_info.loc[[3,7,9]][["Shrt_Desc","Water_(g)","Energ_Kcal"]]
    # 访问885行Age的值(注意:titanic_survival 在后面的 2.11 节才通过 pd.read_csv 加载)
    row_index_885_age = titanic_survival.loc[885,"Age"]   # 39.0  
    row_index_885_age
    # 访问886行Pclass的值
    titanic_survival.loc[886,"Pclass"]

    2.10 DataFrame查看头3行,所有单位为(g)的列数据

    # 查看头3行,所有单位为(g)的列数据
    col_names = food_info.columns.tolist()
    gram_columns = []
    
    for c in col_names:
        if c.endswith("(g)"):
            gram_columns.append(c)
    food_info[gram_columns].head(3)
    food_info.head(1)
    food_info.loc[[0,2,4]]
    food_info.loc[0:3]

    2.11 泰坦尼克号数据试验

    2.11.1 所有age为null的值被选择出来

    # 泰坦尼克号数据试验
    import pandas as pd
    import numpy as np
    titanic_survival = pd.read_csv("titanic_train.csv")
    titanic_survival.head()
    null_indexs = pd.isnull(titanic_survival)["Age"]
    null_df = titanic_survival[null_indexs]["Age"]
    null_df
    
    # 
    age = titanic_survival["Age"]
    age.loc[0:22]
    type(age)          # pandas.core.series.Series
    age_is_null = pd.isnull(age)
    age_is_null
    type(age_is_null)   # pandas.core.series.Series
    
    # 所有age为null的值被选择出来
    age_null_true = age[age_is_null]
    age_null_true           # Name: Age, Length: 177, dtype: float64
    age_null_count = len(age_null_true)
    age_null_count          # 177

    行列访问

    # 访问885行Age的值
    row_index_885_age = titanic_survival.loc[885,"Age"]   # 39.0  
    row_index_885_age
    # 访问886行Pclass的值
    titanic_survival.loc[886,"Pclass"]

    2.11.2 python内置函数,进行数学运算时,一旦任意一个值为NaN,则结果为NaN

    # 进行数学运算时,一旦任意一个值为NaN,则结果为NaN
    mean_age = sum(titanic_survival['Age']) / len(titanic_survival['Age'])  # sum函数,返回NaN
    # pandas的sum方法自动过滤null,正常
    #mean_age = titanic_survival['Age'].sum() / len(titanic_survival['Age']) 
    mean_age
    
    # 人工筛选出非空age
    good_ages = titanic_survival['Age'][age_is_null == False]
    good_ages
    correct_mean_age = sum(good_ages) / len(good_ages)
    correct_mean_age
    
    # pandas自带的数学运算方法,可以自动过滤掉空值
    correct_mean_age = titanic_survival["Age"].mean()
    correct_mean_age

    2.11.3 泰坦尼克号之均值

    # 按类,求fare的均值
    # 数据可按“Pclass”列,分为3类:1,2,3
    passenger_classes = [1, 2, 3]
    fares_by_class = {}
    for this_class in passenger_classes:
        # 按“Pclass”依次取出1,2,3类数据
        pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
        # 取出“Fare”列的数据
        pclasses_fares = pclass_rows["Fare"]
        # pandas内部数学函数mean求均值
        fare_for_class = pclasses_fares.mean()
        # 将不同类的均值,添加进字典fares_by_class
        fares_by_class[this_class] = fare_for_class
    fares_by_class    # {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
    #help(titanic_survival.pivot_table)
    
    # Mean of the Survived column for each Pclass group.
    # The aggregation is named by string ("mean") rather than passed as np.mean,
    # which newer pandas versions warn about.
    passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
    passenger_survival
    
    # Mean of the Age column for each Pclass group (aggfunc defaults to the mean).
    passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
    passenger_age
    
    # Group by Embarked and sum the Fare and Survived columns.
    port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
    port_stats

    2.11.4 df.dropna()  移除缺失的值

    # df.dropna()  移除缺失的值
    # 指定axis=1或axis='columns'将删除任何具有null值的列
    # 指定axis=0或axis='index'将删除任何具有null值的行,默认0
    titanic_survival.dropna(axis=1)
    # 'Age','Sex'两列不为空的所有行
    titanic_survival.dropna(axis=0, subset=['Age','Sex'])
    #titanic_survival.dropna(axis=0)
    #help(titanic_survival.dropna)

    2.11.5 df.sort_values() 对DataFrame进行排序,按Age降序排列

    # df.sort_values() 对DataFrame进行排序,按Age降序排列
    new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
    # reset_index 重置新DataFrame的索引
    new_titanic_survival.reset_index(drop=True)  # 索引从0开始自然递增
    #help(new_titanic_survival.reset_index)

    2.11.6  DataFrame.apply( func ) 函数应用于每列或每行,执行自定义函数func

    # hundredth_row函数返回Series序列的第一百项
    def hundredth_row(column):
        """Return the entry at integer position 99 of ``column``.

        ``column`` must support ``.iloc`` positional indexing; position 99 is
        the hundredth entry when counting from one.
        """
        return column.iloc[99]
    
    # df.apply(func) applies func to each column or each row of the DataFrame.
    # Here it returns the hundredth entry of every column. The result is stored
    # under its own name so the hundredth_row function is not overwritten by
    # the value it produced.
    hundredth_items = titanic_survival.apply(hundredth_row)
    hundredth_items
    #help(titanic_survival.apply)
    # 每列非空总数
    def not_null_count(column):
        """Return the number of entries in ``column`` that are not null."""
        # Invert the null mask, keep only the flagged entries, and count them.
        present_mask = ~pd.isnull(column)
        return len(column[present_mask])
    
    column_null_count = titanic_survival.apply(not_null_count)
    column_null_count
    # df.apply(func,axis=1)
    #通过传入axis=1参数,我们可以使用DataFrame.apply()方法遍历行而不是列。
    # 1、根据Pclass的值进行分类
    def which_class(row):
        """Translate the ``Pclass`` value of ``row`` into a readable label.

        Returns "Unknown" for a null value, the matching label for 1, 2 or 3,
        and ``None`` for any other value.
        """
        ticket_class = row['Pclass']
        if pd.isnull(ticket_class):
            return "Unknown"
        # Compare with == against each known code, in order.
        for code, label in (
            (1, "First Class"),
            (2, "Second Class"),
            (3, "Third Class"),
        ):
            if ticket_class == code:
                return label
        return None
        
    classes = titanic_survival.apply(which_class, axis=1)
    classes
    # df.apply(func,axis=1)
    #通过传入axis=1参数,我们可以使用DataFrame.apply()方法遍历行而不是列。
    # 2、根据年龄判断是否成年
    def is_minor(row):
        """Return True when ``row["Age"]`` is below 18, otherwise False.

        Any age for which the ``< 18`` comparison is false (NaN included)
        yields False.
        """
        return bool(row["Age"] < 18)
        
    titanic_survival.apply(is_minor, axis=1)
    
    def generate_age_label(row):
        """Classify ``row["Age"]`` as "unknown", "minor" or "adult".

        A null age gives "unknown"; an age below 18 gives "minor"; every
        other age gives "adult".
        """
        years = row["Age"]
        # The null check comes first so a missing age never reaches the comparison.
        if pd.isnull(years):
            return "unknown"
        return "minor" if years < 18 else "adult"
        
    age_labels = titanic_survival.apply(generate_age_label, axis=1)
    
    # titanic_survival添加“age_labels”列
    titanic_survival['age_labels'] = age_labels
    titanic_survival.pivot_table(index="age_labels", values="Survived")

    2.12  电影评分

    2.12.1 pandas.read_csv()从文件生成DataFrame

    #FILM - film name
    #RottenTomatoes - Rotten Tomatoes 影评人的平均得分
    #RottenTomatoes_User - Rotten Tomatoes 用户平均得分
    #RT_norm - Rotten Tomatoes 影评人的平均得分(归一化为0 - 5分)
    #RT_user_norm - Rotten Tomatoes 用户平均得分(归一化到0 - 5分系统)
    #Metacritic - Metacritic 评论家的平均得分
    #Metacritic_User - Metacritic 用户的平均得分
    
    import pandas as pd
    #从csv文件中读取数据,生成DataFrame
    # Raw string literal: the Windows backslashes must stay literal, otherwise
    # "\3" is read as an octal escape and "\f" as a form feed, corrupting the path.
    fandango = pd.read_csv(r'G:\python\库应用(4个)\3-可视化库matpltlib\fandango_scores.csv')
    #读取'FILM'列的[0~5)的值
    fandango['FILM'][0:5]
    #访问'RottenTomatoes'列的[0~5)的值
    fandango['RottenTomatoes'][0:5]

    2.13 Series的使用示例

    from pandas import Series
    # 从DataFrame中获取series:“FILM”列,得到Series
    film_series = fandango['FILM']    
    
    # series.values属性,获取所有值列表
    film_names = film_series.values         # type(film_names) 返回numpy.ndarray
    type(film_names)
    
    rt_series = fandango['RottenTomatoes']
    rt_scores = rt_series.values            # type(rt_scores) 返回numpy.ndarray
    type(rt_scores)
    
    # 构建Series,值为rt_scores,索引为film_names
    custom_series = Series(rt_scores, index=film_names)
    # Access by integer position. custom_series is indexed by film names, so
    # .iloc states the positional lookup explicitly instead of relying on []
    # falling back to positions for integer keys.
    custom_series.iloc[[3,5,8]]
    # 通过索引名进行访问
    custom_series[['Minions (2015)', 'Leviathan (2014)']]
    
    # series.index属性,获取所有索引
    type(custom_series.index)               # pandas.core.indexes.base.Index
    type(custom_series.index.tolist())      # list
    original_index = custom_series.index.tolist()
    # sorted(iterable)内置函数,对list进行排序
    sorted_index = sorted(original_index)
    #help(custom_series.reindex)
    # series.reindex(index_arr_like)重置series的索引
    sorted_by_index = custom_series.reindex(sorted_index)
    
    # series按索引排序sort_index、按值排序sort_values
    custom_series.sort_index()
    custom_series.sort_values()
    
    np.add(custom_series,custom_series)   # 等同于 custom_series + custom_series
    np.sin(custom_series)
    np.max(custom_series)
    
    custom_series > 98
    greater_than_98_series = custom_series[custom_series > 98]
    
    condition_one = custom_series > 60
    condition_two = custom_series < 66
    custom_series[condition_one & condition_two]
  • 相关阅读:
    fread 和 read的区别
    Windows下的linux开发环境Cygwin的安装配置
    开机出现grub提示符后怎样进入系统?
    Matlab中函数句柄的优点
    UNIX环境高级编程的apue.h源码APUE
    装了一个ubuntu10.10,打印机不能添加了,
    迅雷上如何下载热映的电影大片~~
    我的linux 初始配置安装的东东,最好保存上一份,对于经常装linux的朋友
    HDU 1875 畅通工程再续
    HDU 1874 畅通工程续
  • 原文地址:https://www.cnblogs.com/LIAOBO/p/15344076.html
Copyright © 2011-2022 走看看