筛选空值
# pandas represents a missing value as NaN ("not a number").
# Series.isnull() yields a boolean Series that is True exactly where the
# value is missing.
age = titanic_survival["Age"]
# Boolean mask over the "Age" column: True where the entry is missing.
age_is_null = age.isnull()
# Boolean indexing keeps only the entries flagged by the mask (all NaN).
age_null_true = age.loc[age_is_null]
# Count of rows whose "Age" is missing.
age_null_count = age_null_true.shape[0]
print(age_null_count)
求均值
# Method 1: compute the mean by hand.
# Missing values have to be filtered out first, otherwise they poison the sum.
# `~age_is_null` inverts the boolean mask, keeping rows where Age is present.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
# print() call form works on both Python 2 and 3 for a single argument.
print(correct_mean_age)
# Method 2: let pandas do it.
# Missing data is so common that many pandas methods filter it automatically;
# Series.mean() skips NaN entries by default.
correct_mean_age = titanic_survival["Age"].mean()
# print() call form works on both Python 2 and 3 for a single argument.
print(correct_mean_age)
# Wrong approach (kept on purpose as a counter-example).
# When the column contains missing values, mean_age comes out as NaN, because
# any arithmetic that involves a null value also produces a null value.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
# print() call form works on both Python 2 and 3 for a single argument.
print(mean_age)
数据透视表
# Pivot table: group rows by "Embarked" and sum "Fare" and "Survived" per group.
# The aggregation is given by name ("sum") rather than as the NumPy callable:
# recent pandas releases emit a FutureWarning for np.sum here, and the string
# selects the same built-in aggregation.
port_stats = titanic_survival.pivot_table(
    index="Embarked",
    values=["Fare", "Survived"],
    aggfunc="sum",
)
排序
# Return a copy of the frame ordered by "Age", largest values first.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
列名处理
# Select every column whose name ends with "(g)" and preview it.
col_names = food_info.columns.tolist()
# Keep only the column names carrying the "(g)" suffix.
gram_columns = [c for c in col_names if c.endswith("(g)")]
# Indexing with a list of names returns a DataFrame with just those columns.
gram_df = food_info[gram_columns]
print(gram_df.head(3))
按列类型过滤
# Categorical features: columns stored with the "object" dtype.
cat_features = list(train.select_dtypes(include=['object']).columns)
print("Categorical: {} features".format(len(cat_features)))
# Continuous features: float64/int64 columns, excluding 'loss' and 'id'.
cont_features = [
    cont
    for cont in train.select_dtypes(include=['float64', 'int64']).columns
    if cont not in ['loss', 'id']
]
print("Continuous: {} features".format(len(cont_features)))
查看类型变量类别个数
# Number of distinct values in each categorical column, in cat_features order.
cat_uniques = [len(train[cat].unique()) for cat in cat_features]
# DataFrame.from_items was removed in pandas 1.0; build the frame from a dict
# and pass `columns` explicitly so the column order stays
# ('cat_name', 'unique_values') on every Python version.
uniq_values_in_categories = pd.DataFrame(
    {'cat_name': cat_features, 'unique_values': cat_uniques},
    columns=['cat_name', 'unique_values'],
)
类型转换
# Cast the 'id' column to 64-bit integers, replacing the column in place.
id_as_int64 = data['id'].astype('int64')
data['id'] = id_as_int64
数据筛选
# Keep the rows of df1 whose cust_no does NOT occur in df2 (anti-join).
df1 = df1.loc[
    ~df1['cust_no'].isin(df2['cust_no'].tolist())
]
# Convert the 'date' column to datetimes.
# The values are rendered as strings and parsed as year-month-day digits
# (presumably YYYYMMDD, e.g. 20200131 -- confirm against the data).
# The format has to be '%Y%m%d': lowercase %m/%d are month/day, whereas the
# previous '%M' meant minutes and '%D' is not a valid strptime directive.
data['date'] = pd.to_datetime(data['date'].astype("str"), format='%Y%m%d')
多个dataframe合并处理
# reduce() is not a builtin on Python 3; functools.reduce exists on both 2 and 3.
from functools import reduce

# Load the three input files (UTF-8 encoded CSV) from path + folder.
input1 = pd.read_csv(path + folder + "01.csv", encoding="utf-8")
input2 = pd.read_csv(path + folder + "02.csv", encoding="utf-8")
input3 = pd.read_csv(path + folder + "03.csv", encoding="utf-8")
inputs = [input1, input2, input3]
# Fold the list left to right with an inner join on "cust_no", so only rows
# whose cust_no is present in every frame survive.
df_all = reduce(
    lambda left, right: pd.merge(left, right, on="cust_no", how="inner"),
    inputs,
)
多个列合并
# One-hot encode "credit_type"; drop_first=True omits the first level's column.
credit_type = pd.get_dummies(
    data["credit_type"],
    prefix="credit_type",
    drop_first=True,
)
# One-hot encode "tran_branch", keeping a column for every level.
tran_branch = pd.get_dummies(data["tran_branch"], prefix="branch")
# Append both sets of indicator columns to the right of the original frame.
data = pd.concat([data, credit_type, tran_branch], axis=1)