参考:http://blog.csdn.net/ppp8300885/article/details/77934822?locationnum=9&fps=1
长尾数据的log转换:
# Compare the distribution of travel_time before and after a log transform.
# NOTE(review): assumes `df` (a DataFrame with a numeric 'travel_time'
# column), `np` and `plt` are already in scope -- they are not defined here.
fig, axes = plt.subplots(nrows=2, ncols=1)

# Top panel: raw values.
df['travel_time'].hist(bins=100, ax=axes[0])

# log1p(x) == log(1 + x); the column is overwritten with the transformed
# values, so everything after this point sees the log-scaled travel_time.
df['travel_time'] = np.log1p(df['travel_time'])

# Bottom panel: the same column after the transform.
df['travel_time'].hist(bins=100, ax=axes[1])
plt.show()
数据平滑处理:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the iris measurements into a DataFrame whose columns carry the
# feature names shipped with the dataset.
iris = datasets.load_iris()
name = iris.feature_names
iris = pd.DataFrame(iris.data)
iris.columns = name


def quantile_clip(group):
    """Clip a Series to its own 5%-95% quantile range and plot before/after.

    Values below the 5% quantile are raised to it and values above the 95%
    quantile are lowered to it. The series is plotted once before and once
    after clipping, then the figure is shown.

    Args:
        group: numeric pandas Series (e.g. one column or one group).

    Returns:
        A new Series of the same length and index with the clipped values.
        The input Series is left unmodified.
    """
    group.plot()
    # Take both thresholds from the untouched data so that the upper bound
    # cannot be influenced by the lower-tail replacement.
    lower = group.quantile(.05)
    upper = group.quantile(.95)
    # clip() returns a new Series instead of assigning through a boolean
    # mask, so the function no longer mutates the object it was handed.
    group = group.clip(lower=lower, upper=upper)
    group.plot()
    plt.show()
    return group


# Store the result explicitly: the clipped values only exist in the Series
# returned by transform().
iris['sepal length (cm)'] = iris['sepal length (cm)'].transform(quantile_clip)
id和time的笛卡尔积转换,并进行缺失值填补
# Build every (link_ID, time_interval_begin) combination and left-join the
# observed data onto it, so combinations with no observation show up as rows
# whose data columns are NaN.
# NOTE(review): assumes `pd` is imported and `df` (the observed data with
# 'link_ID' and 'time_interval_begin' columns) is defined elsewhere.
link_df = pd.read_csv(
    '../raw/gy_contest_link_info.txt',
    delimiter=';',
    dtype={'link_ID': object},  # keep the IDs as strings, not numbers
)

# Two disjoint periods sampled every 2 minutes, concatenated into one index.
date_range = pd.date_range(
    "2016-07-01 00:00:00", "2016-07-31 23:58:00", freq='2min'
).append(
    pd.date_range("2017-04-01 00:00:00", "2017-07-31 23:58:00", freq='2min')
)

# Cartesian product: each unique link paired with each timestamp.
new_index = pd.MultiIndex.from_product(
    [link_df['link_ID'].unique(), date_range],
    names=['link_ID', 'time_interval_begin'],
)

# reset_index() turns the two index levels into ordinary columns (with a
# fresh 0..n-1 row index) so the merge below can join on them.
df1 = pd.DataFrame(index=new_index).reset_index()

# Left join: keep every row of the Cartesian product and attach the existing
# data where the keys match.
# NOTE(review): the key columns in `df` presumably need matching dtypes
# (string link_ID, datetime time_interval_begin) -- confirm against `df`.
df3 = pd.merge(df1, df, on=['link_ID', 'time_interval_begin'], how='left')