zoukankan      html  css  js  c++  java
  • Pandas基础

    目录

      读取数据

      索引与计算

      数据处理——kaggle泰坦尼克号


    读取数据

    a.csv

    name,age,height
    Tom0,15,156.3
    Tom1,17,162.6
    Tom2,12,165.7
    Tom3,15,134.3
    Tom4,27,174.5
    Tom5,56,176.6
    Tom6,21,172.5
    Tom7,53,172.1

    # -*- coding: utf-8 -*-
    import pandas as pd
    
    # Load the sample CSV (columns: name, age, height) into a DataFrame.
    df = pd.read_csv("a.csv")

    # read_csv returns a DataFrame.
    print(type(df))
    # <class 'pandas.core.frame.DataFrame'>

    # Column dtypes are inferred from the file contents.
    print(df.dtypes)
    # name       object
    # age         int64
    # height    float64
    # dtype: object

    # Print the whole table.
    print(df)
    #    name  age  height
    # 0  Tom0   15   156.3
    # 1  Tom1   17   162.6
    # 2  Tom2   12   165.7
    # 3  Tom3   15   134.3
    # 4  Tom4   27   174.5
    # 5  Tom5   56   176.6
    # 6  Tom6   21   172.5
    # 7  Tom7   53   172.1

    # First two rows.
    print(df.head(n=2))
    #    name  age  height
    # 0  Tom0   15   156.3
    # 1  Tom1   17   162.6

    # Last two rows.
    print(df.tail(n=2))
    #    name  age  height
    # 6  Tom6   21   172.5
    # 7  Tom7   53   172.1

    # Column labels.
    print(df.columns)
    # Index(['name', 'age', 'height'], dtype='object')

    # (number of rows, number of columns)
    print(df.shape)
    # (8, 3)

      

     返回目录

    索引与计算 

    # -*- coding: utf-8 -*-
    import pandas as pd
    
    # Load the sample CSV (columns: name, age, height) into a DataFrame.
    df = pd.read_csv("a.csv")

    # Select a single row by its index label; the result is a Series.
    print(df.loc[0])
    # name       Tom0
    # age          15
    # height    156.3
    # Name: 0, dtype: object

    # Slice rows by label; note that both end points are included.
    print(df.loc[2:4])
    #    name  age  height
    # 2  Tom2   12   165.7
    # 3  Tom3   15   134.3
    # 4  Tom4   27   174.5

    # Select one column by name.
    print(df["name"])
    # 0    Tom0
    # 1    Tom1
    # 2    Tom2
    # 3    Tom3
    # 4    Tom4
    # 5    Tom5
    # 6    Tom6
    # 7    Tom7

    # Select several columns by passing a list of names.
    print(df[["name", "age"]])
    #    name  age
    # 0  Tom0   15
    # 1  Tom1   17
    # 2  Tom2   12
    # 3  Tom3   15
    # 4  Tom4   27
    # 5  Tom5   56
    # 6  Tom6   21
    # 7  Tom7   53

    # Get the column names as a plain Python list.
    column_names = list(df.columns)
    print(column_names)
    # ['name', 'age', 'height']

    # Arithmetic on a column is applied element-wise.
    print(df["height"] / 100)
    # 0    1.563
    # 1    1.626
    # 2    1.657
    # 3    1.343
    # 4    1.745
    # 5    1.766
    # 6    1.725
    # 7    1.721
    # Name: height, dtype: float64

    # Extend the DataFrame with a derived column.
    print(df)
    #    name  age  height
    # 0  Tom0   15   156.3
    # 1  Tom1   17   162.6
    # 2  Tom2   12   165.7
    # 3  Tom3   15   134.3
    # 4  Tom4   27   174.5
    # 5  Tom5   56   176.6
    # 6  Tom6   21   172.5
    # 7  Tom7   53   172.1
    print(df.shape)
    # (8, 3)

    # Assigning to a new column label appends that column in place.
    df["height(m)"] = df["height"] / 100
    print(df)
    #    name  age  height  height(m)
    # 0  Tom0   15   156.3      1.563
    # 1  Tom1   17   162.6      1.626
    # 2  Tom2   12   165.7      1.657
    # 3  Tom3   15   134.3      1.343
    # 4  Tom4   27   174.5      1.745
    # 5  Tom5   56   176.6      1.766
    # 6  Tom6   21   172.5      1.725
    # 7  Tom7   53   172.1      1.721
    print(df.shape)
    # (8, 4)

    # Find the maximum value of a column.
    print(df["height"].max())
    # 176.6

     返回目录

    数据处理——kaggle泰坦尼克号 

     

    # -*- coding: utf-8 -*-
    import pandas as pd
    import numpy as np
    
    
    # Load the Kaggle Titanic training set.
    a = pd.read_csv("titanic_train.csv")

    # Sorting: order the rows by age, youngest first.
    b = a.sort_values("Age", ascending=True)
    print(b.head())
    #      PassengerId  Survived  Pclass                             Name     Sex
    # 803          804         1       3  Thomas, Master. Assad Alexander    male
    # 755          756         1       2        Hamalainen, Master. Viljo    male
    # 644          645         1       3           Baclini, Miss. Eugenie  female
    # 469          470         1       3    Baclini, Miss. Helene Barbara  female
    # 78            79         1       2    Caldwell, Master. Alden Gates    male
    #
    #       Age  SibSp  Parch  Ticket     Fare Cabin Embarked
    # 803  0.42      0      1    2625   8.5167   NaN        C
    # 755  0.67      1      1  250649  14.5000   NaN        S
    # 644  0.75      2      1    2666  19.2583   NaN        C
    # 469  0.75      2      1    2666  19.2583   NaN        C
    # 78   0.83      0      2  248738  29.0000   NaN        S

    age = a["Age"]
    print(age.head(10))
    # 0     22.0
    # 1     38.0
    # 2     26.0
    # 3     35.0
    # 4     35.0
    # 5      NaN
    # 6     54.0
    # 7      2.0
    # 8     27.0
    # 9     14.0
    # Name: Age, dtype: float64

    # Boolean mask: True where the age is missing.
    age_is_null = pd.isnull(age)
    print(age_is_null.head(10))
    # 0    False
    # 1    False
    # 2    False
    # 3    False
    # 4    False
    # 5     True
    # 6    False
    # 7    False
    # 8    False
    # 9    False
    # Name: Age, dtype: bool

    # Count the missing values: index with the mask to keep only the
    # rows whose age is missing, then take the length.
    b = age[age_is_null]
    print(b.head())
    # 5    NaN
    # 17   NaN
    # 19   NaN
    # 26   NaN
    # 28   NaN
    # Name: Age, dtype: float64
    print(len(b))
    # 177

    # Mean age.
    # Method 1: drop the missing values by hand (~ inverts the mask), then
    # divide the sum by the count.
    c = age[~age_is_null]
    print(c.sum() / len(c))
    # 29.69911764705882
    # Method 2: Series.mean() gives the same value without filtering first.
    print(age.mean())
    # 29.69911764705882


    # Mean fare per passenger class.
    # Method 1: filter the fares class by class.
    levels = [1, 2, 3]
    Pclass = a["Pclass"]  # passenger class
    Fare = a["Fare"]  # ticket price
    print(type(Fare))
    # <class 'pandas.core.series.Series'>
    print(Pclass.head())
    # 0    3
    # 1    1
    # 2    3
    # 3    1
    # 4    3
    print(Fare.head())
    # 0     7.2500
    # 1    71.2833
    # 2     7.9250
    # 3    53.1000
    # 4     8.0500
    fare_dic = {level: Fare[Pclass == level].mean() for level in levels}
    print(fare_dic)
    # {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
    # Method 2: let pivot_table do the grouping. The aggregation is named by
    # the string "mean" because passing the NumPy callable np.mean is
    # deprecated in recent pandas releases.
    b = a.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
    print(b)
    #              Fare
    # Pclass
    # 1       84.154687
    # 2       20.662183
    # 3       13.675550


    # Survival rate per passenger class (the mean of the 0/1 Survived flag).
    b = a.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
    print(b)
    #         Survived
    # Pclass
    # 1       0.629630
    # 2       0.472826
    # 3       0.242363

    # Mean age and survival rate per passenger class in one table.
    b = a.pivot_table(index="Pclass", values=["Survived", "Age"], aggfunc="mean")
    print(b)
    #               Age  Survived
    # Pclass
    # 1       38.233441  0.629630
    # 2       29.877630  0.472826
    # 3       25.140620  0.242363

    # Dropping missing data.
    print(a.shape)
    # (891, 12)
    b = a.dropna(axis=0)  # drop every row that has a missing value
    print(b.shape)
    # (183, 12)
    b = a.dropna(axis=1)  # drop every column that has a missing value
    print(b.shape)
    # (891, 9)
    # Drop only the rows where "Age" or "Embarked" is missing.
    b = a.dropna(axis=0, subset=["Age", "Embarked"])
    print(b.shape)
    # (712, 12)

    # Locating values.
    print(a.head(1))  # the first passenger's full record
    print(a.loc[0, "Age"])  # the first passenger's age
    #    PassengerId  Survived  Pclass                     Name   Sex   Age  SibSp
    # 0            1         0       3  Braund, Mr. Owen Harris  male  22.0      1
    #
    #    Parch     Ticket  Fare Cabin Embarked
    # 0      0  A/5 21171  7.25   NaN        S
    # 22.0

     返回目录

  • 相关阅读:
    h5及c3新增的一些内容
    Ajax实现步骤和原理
    prototype和__proto__的关系是什么?
    深拷贝与浅拷贝
    promise与async和await的区别
    ph

    p
    python4
    python3
  • 原文地址:https://www.cnblogs.com/itmorn/p/8150517.html
Copyright © 2011-2022 走看看