1.Pandas读取数据
一般错误
import pandas as pd pd.read_csv(r'D:数据分析 2_Pandaspandasfood_info.csv')
out:
--------------------------------------------------------------------------- OSError Traceback (most recent call last) <ipython-input-15-cc3e7efb5b57> in <module>() 1 import pandas as pd ----> 2 pd.read_csv(r'D:数据分析 2_Pandaspandasfood_info.csv') ... pandas\_libsparsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source() OSError: Initializing from file failed
使用read_csv()函数时,可能会报错如上。
显示以上错误,网上解答说可能是路径含有中文名,或者只能读取当前文件夹下的文件;
设置:engine=“python”解决问题!! 默认engine=“C”,使用C时速度较快,但是包含中文时出错。
数据读取
In:
# read_csv函数,读取数据 import pandas as pd food_info = pd.read_csv(r'D:数据分析 2_Pandaspandasfood_info.csv', engine='python') # print(food_info) # pandas读取文件后,数据类型为dataframe格式, # numpy读取文件后的数据类型为array个数 print(type(food_info)) # dataframe.dtypes 返回每列数据的类型;object是pandas中的字符型数据 # numpy中的dtype返回整个文件的数据类型,因此要求数据类型一致;不一致时自动类型转换 print(food_info.dtypes)
Out:
<class 'pandas.core.frame.DataFrame'> NDB_No int64 Shrt_Desc object ... FA_Mono_(g) float64 FA_Poly_(g) float64 Cholestrl_(mg) float64 dtype: object
一般属性
In:
# head(),tail()方法的使用 # print(food_info.head()) # 默认显示前5行 print(food_info.head(3)) # head()函数:读取数据的前3行 # food_info.tail(4) # tail()函数:显示末尾4行 # print(food_info.columns) # columns属性:返回所有列名 print(food_info.shape) # shape属性:显示行列数
Out:
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g)
0 1001 BUTTER WITH SALT 15.87 717 0.85
1 1002 BUTTER WHIPPED WITH SALT 15.87 717 0.85
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28
Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g)
0 81.11 2.11 0.06 0.0 0.06
1 81.11 2.11 0.06 0.0 0.06
2 99.48 0.00 0.00 0.0 0.00
...
[3 rows x 36 columns]
(8618, 36)
2.Pandas索引与计算
loc与iloc
In:
# loc与iloc索引取值 # print(food_info.loc[0]) # 通过loc方法,获取第一行数据的值 print(food_info.iloc[0]) # 通过iloc方法,同样获取结果 # print(food_info.loc[:3]) # 传入切片,获取若干行 # print(food_info.loc[[2, 3, 5]]) # 以列表的形式传入索引,获取索引对应的数据
Out:
NDB_No 1001
Shrt_Desc BUTTER WITH SALT
Water_(g) 15.87
Energ_Kcal 717
Protein_(g) 0.85
...
Vit_K_(mcg) 7
FA_Sat_(g) 51.368
FA_Mono_(g) 21.021
FA_Poly_(g) 3.043
Cholestrl_(mg) 215
Name: 0, dtype: object
切片操作
In:
# 切片操作 # food_info.loc[3:6] # 取出3,4,5,6,共四个样本 # food.info.iloc[3:6] # 只能取到3,4,5共三个样本 index = [2, 3, 5] # food_info.loc[index] food_info.iloc[index] # 此时loc和iloc无区别
Out:
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28 99.48 0.00 0.00 0.0 0.00 ... 3069.0 840.0 2.80 1.8 73.0 8.6 61.924 28.732 3.694 256.0
3 1004 CHEESE BLUE 42.41 353 21.40 28.74 5.11 2.34 0.0 0.50 ... 721.0 198.0 0.25 0.5 21.0 2.4 18.669 7.778 0.800 75.0
5 1006 CHEESE BRIE 48.42 334 20.75 27.68 2.70 0.45 0.0 0.45 ... 592.0 174.0 0.24 0.5 20.0 2.3 17.410 8.013 0.826 100.0
3 rows × 36 columns
说明:
loc是根据索引取值,[start: end]表示的范围从start到end
iloc是根据行号取值,[start: end]表示的范围从start到end-1(即不包括end)
In:
# 取某一列数据的三种方法 # 方法1:传入列名,获取该列的值 column = food_info['NDB_No'] print(column.head(3)) print('---' * 10) # 同样以列表形式传入多列的名称,获取多列数据 print(food_info[['NDB_No', 'Water_(g)']].head(3)) print('---' * 10) # 方法2:通过loc方法,逗号左边传入行数(切片),右边传入列名 water_loc = food_info.loc[:, 'Water_(g)'] print(water_loc.head(3)) print('---' * 10) # 方法3:通过iloc方法,逗号左边传入行数(切片),右边传入列号索引 water_iloc = food_info.iloc[:, 2] print(water_iloc.head(3))
Out:
0 1001
1 1002
2 1003
Name: NDB_No, dtype: int64
------------------------------
NDB_No Water_(g)
0 1001 15.87
1 1002 15.87
2 1003 0.24
------------------------------
0 15.87
1 15.87
2 0.24
Name: Water_(g), dtype: float64
------------------------------
0 15.87
1 15.87
2 0.24
Name: Water_(g), dtype: float64
In:
# 获取两列或多列数据的三种方法 cols = ['Protein_(g)', 'Water_(g)'] # 方法一: 直接传入列名(由列名构成的list结构) cols_value = food_info[cols] print(cols_value.head(3)) # 显示三行 print('---' * 10) # 方法二: 通过loc方法,逗号左边传入要取的行数,右边传入列名 cols_loc = food_info.loc[:, cols] print(cols_loc.head(3)) print('---'*10) # 方法三:通过iloc方法,逗号左边传入要取的行数,右边传入列名索引的list cols_iloc = food_info.iloc[:, [2, 4]] print(cols_iloc.head(3))
Out:
Protein_(g) Water_(g)
0 0.85 15.87
1 0.85 15.87
2 0.28 0.24
------------------------------
Protein_(g) Water_(g)
0 0.85 15.87
1 0.85 15.87
2 0.28 0.24
------------------------------
Water_(g) Protein_(g)
0 15.87 0.85
1 15.87 0.85
2 0.24 0.28
In:
# 获取所有列中以g为结尾的列 # 获取所有列名,并使用tolist()方法转化为列表 cols = food_info.columns.tolist() new_col = [] # 对列名进行遍历 for col in cols: if col.endswith('(g)'): # 判断列名是否以g为结尾 new_col.append(col) # 获取所有以g为结尾的列名 new_data = food_info[new_col] # 以列名为索引,取对应的值 new_data.head(3) #显示前3行
Out:
Water_(g) Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g)
0 15.87 0.85 81.11 2.11 0.06 0.0 0.06 51.368 21.021 3.043
1 15.87 0.85 81.11 2.11 0.06 0.0 0.06 50.489 23.426 3.012
2 0.24 0.28 99.48 0.00 0.00 0.0 0.00 61.924 28.732 3.694
3.Pandas数据预处理
基础运算
In:
# 加减乘除操作 # print(food_info['Iron_(mg)']) #对应相除,对每个数据操作 div_1000 = food_info['Iron_(mg)']/1000 print(div_1000.head(3)) print('---'*10) # 对应相加,其他操作类似,依次对应元素操作 add_1000 = food_info['Iron_(mg)'] + 1000 print(add_1000.head(3)) print('---'*10) # 取出数据中的两列,然后相乘。也是对应元素相乘 water_energy = food_info['Iron_(mg)'] * food_info['Water_(g)'] print(water_energy.head(3)) print('---'*10) # 将water_energy列做为新元素列,添加到food_info中是以字典的形式更新 print(food_info.shape) food_info['water_energy'] = water_energy print(food_info.shape)
Out:
0 0.00002
1 0.00016
2 0.00000
Name: Iron_(mg), dtype: float64
------------------------------
0 1000.02
1 1000.16
2 1000.00
Name: Iron_(mg), dtype: float64
------------------------------
0 0.3174
1 2.5392
2 0.0000
dtype: float64
------------------------------
(8618, 36)
(8618, 37)
归一化操作
In:
# 归一化操作 # 用.max()函数,获取列最大值 max_calories = food_info['Energ_Kcal'].max() print(max_calories) print('---'*10) norm_calories = food_info['Energ_Kcal'] / max_calories print(norm_calories.head(3))
Out:
902
------------------------------
0 0.794900
1 0.794900
2 0.971175
Name: Energ_Kcal, dtype: float64
排序操作
In:
# 排序操作 # 参数1:对该列进行排序; # 参数二:是否覆盖该列;默认升序 # 缺失值显示NaN,默认缺失值排在最后 food_info.sort_values('Sodium_(mg)', inplace=True) print(food_info['Sodium_(mg)'].head(3)) print('---'*10) # 指定参数ascending=False,改为降序 food_info.sort_values('Sodium_(mg)', inplace=True, ascending=False) print(food_info['Sodium_(mg)'].head(3)) print('---'*10)
Out:
760 0.0
787 0.0
2270 0.0
Name: Sodium_(mg), dtype: float64
------------------------------
276 38758.0
5814 27360.0
6192 26050.0
Name: Sodium_(mg), dtype: float64
In:
import pandas as pd import numpy as np # 先读取泰坦尼克号收据 titantic_survival = pd.read_csv(r'D:数据分析 2_Pandaspandas itanic_train.csv', engine='python') titantic_survival.head() # 显示前5行
Out:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
缺失值判断
In:
# 读取年龄列 age = titantic_survival['Age'] # 判断age列中的值是否缺失 # 缺省值Nan,返回布尔值; # 缺失值返回True;否则返回False age_is_null = pd.isnull(age) # 以age_is_null 为索引值,获取所有缺失值 age[age_is_null].head # 利用len函数,得到缺失值的数量。 print(len(age[age_is_null]))
Out:
177
In:
# 传入没有缺失值的索引,获取所有没有缺失值的行 good_age = titantic_survival['Age'][age_is_null == False] good_age.head()
Out:
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
Name: Age, dtype: float64
4.Pandas的常用函数
平均值函数
In:
# mean函数获取平均值 mean_age = titantic_survival['Age'].mean() print(mean_age) # 求平均值要求所有值不能有缺失 # mean函数会自动过滤缺失值
Out:
29.69911764705882
缺失值处理函数
In:
# fillna函数对缺失值进行处理 age_series = titantic_survival['Age'] #用平均值替换缺失值 # age_series.fillna(mean_age, inplace=True) # print(age_series.head(3))
In:
# dropna函数对缺失值处理 # 按列删除,删除有缺失值的列 # axis=1按照列删除 drop_nan_cols = titantic_survival.dropna(axis=1) # 按行删除,删除age和sex列中有缺失的行 # axis=0表示按行删除 # subset表示对哪个列进行操作 dropped_nan = titantic_survival.dropna(axis=0, subset=['Age', 'Sex']) dropped_nan.head()
Out:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
透视表操作
In:
# 统计不同等级船舱的平均费用 passenget_classes = [1, 2, 3] # 船舱等级 fares_by_class = {} for this_class in passenget_classes: # 以船舱等级为this_class为索引,得到this_class的所有行 class_rows = titantic_survival[titantic_survival["Pclass"] == this_class] # 从得到数据中,取Fare列对应的所有值 class_fare = class_rows['Fare'] # 取平均 fare_mean = class_fare.mean() # 统计 fares_by_class[this_class] = fare_mean print(fares_by_class)
Out:
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
In:
# 利用pivot_table函数,实现上述功能。 passenger_survival = titantic_survival.pivot_table(index='Pclass', values='Survived', aggfunc=np.mean) print(passenger_survival) # index参数:设置索引,以index给定值为基准进行统计 # values参数:设置值, 要统计的列 # aggfunc参数:设置对values的运算函数,按照哪种方式进行统计
Out:
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
In:
# pivot_table函数,统计船舱中乘客的平均年龄 passenger_age = titantic_survival.pivot_table(index='Pclass', values='Age') print(passenger_age) # 默认aggfunc参数为np.mean
Out:
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
In:
# pivot_table函数,统计登船地点的价格与获救人数多个指标 port_stats = titantic_survival.pivot_table(index='Embarked', values=['Fare', 'Survived'], aggfunc=np.sum) print(port_stats)
Out:
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
In:
# loc函数,获取具体列中的元素 row_index_age = titantic_survival.loc[83, 'Age'] # 获取83号样本的年龄 row_index_cls = titantic_survival.loc[83, 'Pclass'] # 获取83号样本的船舱等级
设置索引
In:
# reset_index函数,设置索引 # 首先利用sort_values按照年龄降序排序 new_titantic = titantic_survival.sort_values('Age', ascending=False) # 利用reset_index函数重新设置索引,并删除缺失值;drop=True删掉之前的索引值 new_titantic_reindex = new_titantic.reset_index(drop=True) print(new_titantic_reindex.loc[:3])
Out:
PassengerId Survived Pclass Name Sex
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
2 494 0 1 Artagaveytia, Mr. Ramon male
3 97 0 1 Goldschmidt, Mr. George B male
apply函数
In:
# apply函数,自定义函数实现特定功能 def hundred_row(columns): # 获取每列中的第100号样本 item = columns.loc[99] return item # 使用apply函数,传入函数对象,执行函数中的功能 hundred_row = titantic_survival.apply(hundred_row) print(hundred_row) # dataframe.apply(function, axis) 对一行或一列做一些操作 # axis=0时,对dataframe的每一列进行操作 # apply函数对dataframe的每一列传入到function中,并获取返回值 # 和线程池中的apply函数原理相同
Out:
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
In:
# apply函数,定义函数,重新定义船舱等级 def which_class(row): pclass = row['Pclass'] if pd.isnull(pclass): return "Unknow" elif pclass == 1: return "First class" elif pclass == 2: return "Second class" else: return "Third class" classes = titantic_survival.apply(which_class, axis=1) print(classes.head())
Out:
0 Third class
1 First class
2 Third class
3 First class
4 Third class
dtype: object
In:
# apply函数,统计年龄是否成年 def gen_age_label(row): age = row['Age'] if pd.isnull(age): return "Unknow" elif age < 18: return "Minor" else: return "Adult" age_lab = titantic_survival.apply(gen_age_label, axis=1) print(age_lab.head())
Out:
0 Adult
1 Adult
2 Adult
3 Adult
4 Adult
dtype: object
In:
# pivot_table函数,统计成年人获救概率 titantic_survival['age_lab'] = age_lab age_suvival = titantic_survival.pivot_table(index='age_lab', values='Survived') print(age_suvival)
Out:
Survived
age_lab
Adult 0.361183
Minor 0.539823
In:
# apply函数, 统计每一列中缺失值个数
def not_null(columns): # 判断每列中的值是否为缺失值 col_null_index = pd.isnull(columns) # 获取所有缺失值 null = columns[col_null_index] return len(null) # 返回缺失值数量 null_count = titantic_survival.apply(not_null) print(null_count)
Out:
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
5.Pandas的series结构
series
In:
# dataframe中的一行或一列称之为series # series就是dataframe的一个子集 import pandas as pd import numpy as np fandango = pd.read_csv(r"D:数据分析 2_Pandaspandasfandango_score_comparison.csv", engine='python') series_film = fandango['FILM'] print(type(series_film)) print(series_film.head()) print(series_film[:5]) series_rt = fandango['RottenTomatoes'] series_rt[:4]
Out:
<class 'pandas.core.series.Series'> 0 Avengers: Age of Ultron (2015) 1 Cinderella (2015) 2 Ant-Man (2015) 3 Do You Believe? (2015) 4 Hot Tub Time Machine 2 (2015) Name: FILM, dtype: object 0 Avengers: Age of Ultron (2015) 1 Cinderella (2015) 2 Ant-Man (2015) 3 Do You Believe? (2015) 4 Hot Tub Time Machine 2 (2015) Name: FILM, dtype: object
构造series
In:
# 构造一个series结构 from pandas import Series # 拿到电影名, values返回该列的所有值,返回ndarray film_name = series_film.values # 拿到评分值,返回ndarray film_rt = fandango['RottenTomatoes'].values # 构造新series,设置索引为电影名 series_new = Series(film_rt, index=film_name) print(series_new.head()) # 设置索引后,即可按照索引取值,相当于额外增加一条索引 series_new[['Cinderella (2015)', 'Ant-Man (2015)']] # 也可以按照原来的数字索引取值 series_new[:2]
Out:
Avengers: Age of Ultron (2015) 74
Cinderella (2015) 85
Ant-Man (2015) 80
Do You Believe? (2015) 18
Hot Tub Time Machine 2 (2015) 14
dtype: int64
Avengers: Age of Ultron (2015) 74
Cinderella (2015) 85
dtype: int64
注:
利用series_new.loc[:3]会报错!!
因为loc方法的索引可以为整型,字符型,但是这里创建的series结构的时候,指定的index是字符串,不是整型,所以会报错。
series索引
In:
original__index = series_new.index.tolist() # 排序 sorted_index = sorted(original__index) # reindex(或set_index)函数重新设置索引 sorted_by_reindex = series_new.reindex(sorted_index) print(sorted_by_reindex.head())
Out:
'71 (2015) 97 5 Flights Up (2015) 52 A Little Chaos (2015) 40 A Most Violent Year (2014) 90 About Elly (2015) 97 dtype: int64
series基本运算
In:
# series结构基本操作 # 相加,对应位置相加,其他运算类似 print(np.add(series_new, series_new).head()) # 判断(得到True和False做索引) print((series_new > 50).head())
Out:
Avengers: Age of Ultron (2015) 148 Cinderella (2015) 170 Ant-Man (2015) 160 Do You Believe? (2015) 36 Hot Tub Time Machine 2 (2015) 28 dtype: int64 Avengers: Age of Ultron (2015) True Cinderella (2015) True Ant-Man (2015) True Do You Believe? (2015) False Hot Tub Time Machine 2 (2015) False dtype: bool