目录
读取数据
索引与计算
数据处理——kaggle泰坦尼克号
读取数据 |
a.csv(下文示例使用的数据文件,内容如下):
name,age,height
Tom0,15,156.3
Tom1,17,162.6
Tom2,12,165.7
Tom3,15,134.3
Tom4,27,174.5
Tom5,56,176.6
Tom6,21,172.5
Tom7,53,172.1
# -*- coding: utf-8 -*-
"""Load ``a.csv`` into a DataFrame and inspect its basic properties.

The trailing ``#`` comments show the output each ``print`` produces for the
sample ``a.csv`` (8 rows; columns ``name``, ``age``, ``height``).
"""
import pandas as pd

a = pd.read_csv("a.csv")

# read_csv returns a DataFrame.
print(type(a))
# <class 'pandas.core.frame.DataFrame'>

# Per-column dtypes are inferred from the file contents.
print(a.dtypes)
# name       object
# age         int64
# height    float64
# dtype: object

# Whole table.
print(a)
#    name  age  height
# 0  Tom0   15   156.3
# 1  Tom1   17   162.6
# 2  Tom2   12   165.7
# 3  Tom3   15   134.3
# 4  Tom4   27   174.5
# 5  Tom5   56   176.6
# 6  Tom6   21   172.5
# 7  Tom7   53   172.1

# First two rows.
print(a.head(2))
#    name  age  height
# 0  Tom0   15   156.3
# 1  Tom1   17   162.6

# Last two rows.
print(a.tail(2))
#    name  age  height
# 6  Tom6   21   172.5
# 7  Tom7   53   172.1

# Column labels.
print(a.columns)
# Index(['name', 'age', 'height'], dtype='object')

# (number of rows, number of columns).
print(a.shape)
# (8, 3)
索引与计算 |
# -*- coding: utf-8 -*-
"""Indexing, column selection and element-wise arithmetic on ``a.csv``.

The trailing ``#`` comments show the output each ``print`` produces for the
sample ``a.csv`` (8 rows; columns ``name``, ``age``, ``height``).
"""
import pandas as pd

a = pd.read_csv("a.csv")

# Fetch a single row by its index label; the result is a Series.
print(a.loc[0])
# name       Tom0
# age          15
# height    156.3
# Name: 0, dtype: object

# Row slicing with .loc is label-based and includes BOTH endpoints (2, 3, 4).
print(a.loc[2:4])
#    name  age  height
# 2  Tom2   12   165.7
# 3  Tom3   15   134.3
# 4  Tom4   27   174.5

# Select a single column.
print(a["name"])
# 0    Tom0
# 1    Tom1
# 2    Tom2
# 3    Tom3
# 4    Tom4
# 5    Tom5
# 6    Tom6
# 7    Tom7

# Select several columns by passing a list of names.
print(a[["name", "age"]])
#    name  age
# 0  Tom0   15
# 1  Tom1   17
# 2  Tom2   12
# 3  Tom3   15
# 4  Tom4   27
# 5  Tom5   56
# 6  Tom6   21
# 7  Tom7   53

# Get the column names as a plain Python list.
col_lst = a.columns.tolist()
print(col_lst)
# ['name', 'age', 'height']

# Arithmetic on a column is applied element-wise.
print(a["height"] / 100)
# 0    1.563
# 1    1.626
# 2    1.657
# 3    1.343
# 4    1.745
# 5    1.766
# 6    1.725
# 7    1.721
# Name: height, dtype: float64

# Extend the DataFrame with a derived column: state before ...
print(a)
#    name  age  height
# 0  Tom0   15   156.3
# 1  Tom1   17   162.6
# 2  Tom2   12   165.7
# 3  Tom3   15   134.3
# 4  Tom4   27   174.5
# 5  Tom5   56   176.6
# 6  Tom6   21   172.5
# 7  Tom7   53   172.1
print(a.shape)
# (8, 3)

# ... assigning to a new column label appends the column in place ...
t = a["height"] / 100
a["height(m)"] = t

# ... and state after.
print(a)
#    name  age  height  height(m)
# 0  Tom0   15   156.3      1.563
# 1  Tom1   17   162.6      1.626
# 2  Tom2   12   165.7      1.657
# 3  Tom3   15   134.3      1.343
# 4  Tom4   27   174.5      1.745
# 5  Tom5   56   176.6      1.766
# 6  Tom6   21   172.5      1.725
# 7  Tom7   53   172.1      1.721
print(a.shape)
# (8, 4)

# Largest value in a column.
print(a["height"].max())
# 176.6
数据处理——kaggle泰坦尼克号 |
# -*- coding: utf-8 -*-
"""Basic data processing on the Kaggle Titanic training set.

Covers sorting, missing-value detection, means, per-class aggregation with
``pivot_table``, dropping missing data and single-cell lookup. The trailing
``#`` comments show the output each ``print`` produces for
``titanic_train.csv`` (891 rows, 12 columns).
"""
import pandas as pd
import numpy as np  # not needed by the pivot tables below any more; import kept as in the original

a = pd.read_csv("titanic_train.csv")

# --- Sorting ---------------------------------------------------------------
# sort_values returns a new, sorted DataFrame; the original row labels are kept.
b = a.sort_values("Age", ascending=True)
print(b.head())
#      PassengerId  Survived  Pclass                             Name     Sex
# 803          804         1       3  Thomas, Master. Assad Alexander    male
# 755          756         1       2        Hamalainen, Master. Viljo    male
# 644          645         1       3           Baclini, Miss. Eugenie  female
# 469          470         1       3    Baclini, Miss. Helene Barbara  female
# 78            79         1       2    Caldwell, Master. Alden Gates    male
#
#       Age  SibSp  Parch  Ticket     Fare Cabin Embarked
# 803  0.42      0      1    2625   8.5167   NaN        C
# 755  0.67      1      1  250649  14.5000   NaN        S
# 644  0.75      2      1    2666  19.2583   NaN        C
# 469  0.75      2      1    2666  19.2583   NaN        C
# 78   0.83      0      2  248738  29.0000   NaN        S

# --- Missing values ----------------------------------------------------------
age = a["Age"]
print(age.head(10))
# 0    22.0
# 1    38.0
# 2    26.0
# 3    35.0
# 4    35.0
# 5     NaN
# 6    54.0
# 7     2.0
# 8    27.0
# 9    14.0
# Name: Age, dtype: float64

# Boolean mask: True where the age is missing.
age_is_null = pd.isnull(age)
print(age_is_null.head(10))
# 0    False
# 1    False
# 2    False
# 3    False
# 4    False
# 5     True
# 6    False
# 7    False
# 8    False
# 9    False
# Name: Age, dtype: bool

# Count the missing values: index with the mask to keep only the null ages.
b = age[age_is_null]
print(b.head())
# 5    NaN
# 17   NaN
# 19   NaN
# 26   NaN
# 28   NaN
# Name: Age, dtype: float64
print(len(b))
# 177

# --- Mean age ----------------------------------------------------------------
# Method 1: keep only the non-missing ages (~ inverts the boolean mask) and
# average them by hand.
c = age[~age_is_null]
print(c.sum() / len(c))
# 29.69911764705882

# Method 2: Series.mean() skips missing values by itself.
print(age.mean())
# 29.69911764705882

# --- Mean fare per passenger class ---------------------------------------------
# Method 1: filter by class and average manually.
levels = [1, 2, 3]
Pclass = a["Pclass"]  # passenger class
Fare = a["Fare"]      # ticket price
print(type(Fare))
# <class 'pandas.core.series.Series'>
print(Pclass.head())
# 0    3
# 1    1
# 2    3
# 3    1
# 4    3
print(Fare.head())
# 0     7.2500
# 1    71.2833
# 2     7.9250
# 3    53.1000
# 4     8.0500

fare_dic = {}
for level in levels:
    fare_dic[level] = Fare[Pclass == level].mean()
print(fare_dic)
# {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

# Method 2: pivot_table groups by `index` and aggregates `values`.
# The aggregation is named by string ("mean"): passing the NumPy callable
# np.mean is deprecated in recent pandas and emits a FutureWarning.
b = a.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(b)
#              Fare
# Pclass
# 1       84.154687
# 2       20.662183
# 3       13.675550

# Mean of the Survived column per class. Survived appears as 0/1 in the
# rows printed above, so this reads as the survival rate of each class.
b = a.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(b)
#         Survived
# Pclass
# 1       0.629630
# 2       0.472826
# 3       0.242363

# Several value columns at once: mean age and mean of Survived per class.
b = a.pivot_table(index="Pclass", values=["Survived", "Age"], aggfunc="mean")
print(b)
#               Age  Survived
# Pclass
# 1       38.233441  0.629630
# 2       29.877630  0.472826
# 3       25.140620  0.242363

# --- Dropping missing data -----------------------------------------------------
print(a.shape)
# (891, 12)

# axis=0: drop every ROW that contains at least one missing value.
b = a.dropna(axis=0)
print(b.shape)
# (183, 12)

# axis=1: drop every COLUMN that contains at least one missing value.
b = a.dropna(axis=1)
print(b.shape)
# (891, 9)

# Drop only the rows where "Age" or "Embarked" is missing.
b = a.dropna(axis=0, subset=["Age", "Embarked"])
print(b.shape)
# (712, 12)

# --- Locating a value ----------------------------------------------------------
# Full record of the first passenger.
print(a.head(1))
#    PassengerId  Survived  Pclass                     Name   Sex   Age  SibSp
# 0            1         0       3  Braund, Mr. Owen Harris  male  22.0      1
#
#    Parch     Ticket  Fare Cabin Embarked
# 0      0  A/5 21171  7.25   NaN        S

# Age of the first passenger: .loc[row_label, column_label].
print(a.loc[0, "Age"])
# 22.0