Stacked Regressions : Top 4% on LeaderBoard
1. subprocess 的 check_output 函数，用来得到命令行的输出结果
import subprocess

# Kaggle snippet: list the files available in the input directory.
print(subprocess.check_output(["ls", "../input"]).decode("utf8"))

# Example: act on the text a command wrote to stdout.
# check_output returns bytes on Python 3, so decode before searching for a
# str; calling bytes.find("yes") would raise TypeError.
output = subprocess.check_output(["python3", "xx.py"], shell=False).decode("utf8")
if "yes" in output:
    print("yes")
else:
    print("no")
2. csv操作
# Load the training set.
train = pd.read_csv('../input/train.csv')
# Show the first five rows of the CSV (the result is only displayed in an
# interactive/notebook session).
train.head(5)
# Drop the Id column in place.
train.drop(columns="Id", inplace=True)
# Remove specific rows: GrLivArea above 4000 combined with SalePrice below 300000.
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train.drop(train[outlier_mask].index)
# Concatenate train and test rows and renumber the index from 0.
# NOTE(review): `test` is not defined in this snippet — presumably loaded
# from the matching test CSV elsewhere; confirm.
all_data = pd.concat((train, test)).reset_index(drop=True)
3. 可视化
# Scatter plot of GrLivArea (x axis) against SalePrice (y axis).
fig, ax = plt.subplots()
living_area = train['GrLivArea']
sale_price = train['SalePrice']
ax.scatter(x=living_area, y=sale_price)
plt.xlabel('GrLivArea', fontsize=13)
plt.ylabel('SalePrice', fontsize=13)
plt.show()
# Visualise the SalePrice distribution with seaborn, overlaying a fitted normal curve.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the
# installed version still provides it.
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
mu, sigma = norm.fit(train['SalePrice'])
# The newlines must be written as \n escapes; a single-quoted literal cannot
# span physical lines.
print('\nmu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
# Raw string so the mathtext commands \mu and \sigma keep their backslashes.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
4. NULL值检查、处理
# Percentage of missing values in each column.
all_data_na = all_data.isnull().sum() / len(all_data) * 100
# Discard columns with no missing values, sort descending, keep the top 30.
complete_cols = all_data_na[all_data_na == 0].index
all_data_na = all_data_na.drop(complete_cols).sort_values(ascending=False)[:30]
# One-column table of the ratios; show its first 20 rows.
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(20)
# Replace missing values in the garage columns with the string 'None'.
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
# Fill each missing LotFrontage with the median LotFrontage of the rows that
# share its Neighborhood.
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
5. 数据关联性检查
# Correlation matrix of the training features, drawn as a heatmap.
# Restrict to numeric/bool columns explicitly: on pandas >= 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string columns.
corrmat = train.select_dtypes(include=["number", "bool"]).corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True)
6. Label Encoding
from sklearn.preprocessing import LabelEncoder
# Columns whose values are replaced by integer codes from LabelEncoder.
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
# (a fresh encoder per column, fitted on that column's own values)
for c in cols:
    lbl = LabelEncoder()
    lbl.fit(list(all_data[c].values))
    all_data[c] = lbl.transform(list(all_data[c].values))
# shape
print('Shape all_data: {}'.format(all_data.shape))
7. OneHot Encoding
# One-hot encode all_data with pandas' default column selection, then print
# the resulting (rows, columns) shape.
all_data = pd.get_dummies(data=all_data)
print(all_data.shape)