今天打个卡，还不错，学到第 20 课了。简单地把 pandas 的操作过一遍——pandas 学得没有 numpy 好。
1. 读取csv文件
import pandas
# Load the food dataset from CSV, then show the type of the loaded object
# and the dtype pandas chose for each column.
food_info = pandas.read_csv("food_info.csv")
print(type(food_info), food_info.dtypes, sep="\n")
# Uncomment to read the full read_csv documentation:
# print(help(pandas.read_csv))
2. 读取头部几行 .head()，可以设置行数；读取尾部几行 .tail()，也可以设置行数
# Uncomment any of these to preview the data; head()/tail() take a row count.
# food_info.head(3)
# first_rows = food_info.head(4)
# first_rows
# food_info.tail(4)
# print(food_info.columns)  # print the column labels
print(food_info.shape)  # print the DataFrame's shape (its dimensions, not its type)
3. 读取行操作 .loc[ : ]
# Row selection with .loc, which looks rows up by index label.
# print(food_info.loc[0])  # a single row
# print(food_info.loc[3:6])  # a label slice; .loc slices include both end labels
print(food_info.loc[[2,5,10]])  # a list of labels selects exactly those rows
4. 读取列元素
# Indexing the DataFrame with a column name selects that one column.
year_col = food_info["Year"]
print(year_col)
5. 筛选列名以 t 结尾的列，跟 python 的字符串操作是一致的
# Keep only the columns whose name ends with "t" and preview them.
col_names = food_info.columns.tolist()
print(col_names)
# The column labels are plain Python strings, so ordinary string filtering
# applies; the comprehension replaces the append loop whose indentation was
# lost in the original paste.
t_col = [name for name in col_names if name.endswith("t")]
# Indexing with a list of labels returns a DataFrame holding just those columns.
t_df = food_info[t_col]
print(t_df.head(3))
6. 列 元素的四则运算 保证维度一致
# Arithmetic between a column and a scalar is applied to every row of the
# column.
product_cost = food_info["Product cost"]
print(product_cost)
div_1000 = product_cost / 1000
print(div_1000)
7. 对列元素进行排序, 默认升序 sort_values( )
# Sort the table in place by "Product cost": ascending first (the default),
# then descending.
food_info.sort_values(by="Product cost", inplace=True)
print(food_info["Product cost"])

food_info.sort_values(by="Product cost", ascending=False, inplace=True)
print(food_info["Product cost"])
8. 用numpy 和pandas 做一个简单的数据分析
import pandas as pd
import numpy as np
# Load the Titanic training set and preview its first five rows (the bare
# expression is only displayed when run as the last line of a notebook cell).
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head(5)
9. 将age 中为空的选择出来
# Count how many passengers have no recorded age.
age = titanic_survival["Age"]
# print(age.loc[0:10])
# Boolean mask: True wherever the age is missing.
age_is_null = age.isnull()
# print(age_is_null)
# Masking the column keeps only the missing entries.
age_null_true = age.loc[age_is_null]
# print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)
10. 找出age的平均值 有NAN值没办法求
# Naive mean using the built-in sum(): any NaN in the column propagates
# through the addition, so the result is NaN when ages are missing.
all_ages = titanic_survival["Age"]
mean_age = sum(all_ages) / len(all_ages)
print(mean_age)
11. 将age 为空的筛选掉, 这个比较重要
# Drop the missing ages first by inverting the null mask, then average what
# remains.
good_ages = titanic_survival["Age"][~age_is_null]
# print(good_ages)
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
12. 找出age的平均值
# Series.mean() skips NaN values by default, so no manual filtering is needed.
correct_mean_age = titanic_survival["Age"].mean(skipna=True)
print(correct_mean_age)
13. 找出 1、2、3 等舱（Pclass）各自的平均票价，写法与 python 一样
# Average fare for each passenger class, computed with a plain Python loop.
passenger_classes = [1, 2, 3]
fares_by_classes = {}
for this_class in passenger_classes:
    # Boolean mask selecting the passengers travelling in this class.
    in_this_class = titanic_survival["Pclass"] == this_class
    # Series.mean() ignores missing fares by default.
    fares_by_classes[this_class] = titanic_survival.loc[in_this_class, "Fare"].mean()
print(fares_by_classes)
14. 简单操作：用 pivot_table 找出各舱位等级（Pclass）的平均生存率
# Mean of "Survived" per passenger class. The aggregation is named by string:
# passing the NumPy callable (np.mean) triggers a FutureWarning on recent
# pandas versions, while "mean" gives the same result without it.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)
15 求平均年龄
# Mean age per passenger class. "mean" is passed as a string instead of
# np.mean to avoid the FutureWarning recent pandas versions emit for NumPy
# callables; the result is the same.
passenger_age = titanic_survival.pivot_table(
    index="Pclass", values="Age", aggfunc="mean"
)
print(passenger_age)
16求和
# Total fare and total "Survived" per embarkation port. "sum" is passed as a
# string instead of np.sum to avoid the FutureWarning recent pandas versions
# emit for NumPy callables; the result is the same.
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)
17 去掉空值
# Two ways of discarding missing data:
# 1) drop every column that contains at least one missing value;
drop_na_columns = titanic_survival.dropna(axis="columns")
# 2) drop only the rows where "Age" or "Sex" is missing.
new_titanic_survival = titanic_survival.dropna(
    axis="index",
    subset=["Age", "Sex"],
)
print(new_titanic_survival)
18 找出特定值
# .loc[row_label, column_label] returns a single cell.
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE: despite the "1000" in the variable name, the row label read here is 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age, row_index_1000_pclass, sep="\n")
19 排序
# Sort by age, oldest first. The rows keep their original index labels, so
# the first ten rows are taken by position.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival.iloc[:10])

# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old labels.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print("---------------------------------------------------------")
# Label slices with .loc include both ends, so this prints rows 0 through 10.
print(titanic_reindexed.loc[0:10])
20 将自己的函数与数据结合起来
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99.

    Intended to be passed to ``DataFrame.apply``, which calls it once per
    column. The lookup is label-based (``.loc``), so it yields the hundredth
    row only when the index is the default 0..n-1 numbering.

    Raises:
        KeyError: if ``column`` has no index label 99.
    """
    return column.loc[99]
# Apply the function to every column. The result gets its own name: assigning
# it back to `hundredth_row` would replace the function with a Series, so the
# function could no longer be called afterwards.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)
21 求每一个列对应的空值元素个数
def not_null_count(column):
    """Return how many entries of ``column`` are missing (null/NaN).

    NOTE: despite its name, this counts the *null* entries, not the non-null
    ones; the name is kept so existing callers keep working.

    Intended to be passed to ``DataFrame.apply``, which calls it once per
    column.
    """
    # Summing the boolean mask counts the True (missing) entries directly,
    # without building an intermediate filtered Series.
    return int(pd.isnull(column).sum())
# Run the counter once per column (axis=0 is the default direction for apply);
# the result holds one missing-value count per column.
column_null_count = titanic_survival.apply(not_null_count, axis=0)
print(column_null_count)
22. 换标签找出生存与年龄的关系
# NOTE(review): `age_labels` is not defined anywhere in the visible file; it
# has to be built before this point (presumably one label per passenger row)
# - TODO confirm where it comes from.
titanic_survival["age_labels"] = age_labels
# The keyword is `values` (plural); the original `value=` is not a
# pivot_table parameter and raised a TypeError. With no aggfunc given,
# pivot_table averages, so this is the mean of "Survived" per age label.
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)