之前在kaggle上做了关于房价预测的比赛,现整理如下。
解决问题的大概步骤是:
1、通过画图查看目标变量SalePrice是否偏分布,若是,则进行log(x+1)变换。并查看数值变量,若偏度大于0.75,也做log(x+1)变换
2、缺失值处理。分类变量NA NA值赋值为0,数值变量中的NA赋值为其平均值
3、将分类变量转化为哑变量
4、回归分析。分别用Ridge回归与Lasso回归进行回归分析,因为Lasso回归的误差平均值更小,所以选取Lasso回归的结果作为最终预测结果
具体代码如下:
library(ggplot2) library(plyr) library(dplyr) library(caret) library(moments) library(glmnet) library(elasticnet) library(knitr) train=read.csv("train.csv",stringsAsFactors = F,header = T) test=read.csv("test.csv",stringsAsFactors = F,header = T) all_data=rbind(select(train,MSSubClass:SaleCondition),select(test,MSSubClass:SaleCondition)) #画出售价的直方图,发现左偏,用log(x+1)进行调整 df=rbind(data.frame(version="log(price+1)",x=log(train$SalePrice+1)),data.frame(version="price",x=train$SalePrice)) ggplot(data=df)+facet_wrap(~version,ncol=2,scales="free_x")+geom_histogram(aes(x=x))
train$SalePrice=log(train$SalePrice+1) #数值变量偏度》0.75时做log变换 feature_classes <- sapply(names(all_data),function(x){class(all_data[[x]])}) numeric_feats <-names(feature_classes[feature_classes != "character"]) skewed_feats <- sapply(numeric_feats,function(x){skewness(all_data[[x]],na.rm=TRUE)}) skewed_feats <- skewed_feats[skewed_feats > 0.75] for(x in names(skewed_feats)) { all_data[[x]] <- log(all_data[[x]] + 1) } #分类变量,dummy variables categorical_feats <- names(feature_classes[feature_classes == "character"]) dummies <- dummyVars(~.,all_data[categorical_feats]) categorical_1_hot <- predict(dummies,all_data[categorical_feats]) categorical_1_hot[is.na(categorical_1_hot)] <- 0 #数值变量,用平均值弥补缺失值 numeric_df <- all_data[numeric_feats] for (x in numeric_feats) { mean_value <- mean(train[[x]],na.rm = TRUE) all_data[[x]][is.na(all_data[[x]])] <- mean_value } #预处理后的重建data all_data <- cbind(all_data[numeric_feats],categorical_1_hot) X_train <- all_data[1:nrow(train),] X_test <- all_data[(nrow(train)+1):nrow(all_data),] y <- train$SalePrice #models CARET.TRAIN.CTRL <- trainControl(method="repeatedcv", number=5, repeats=5, verboseIter=FALSE) lambdas <- seq(1,0,-0.001) set.seed(123) # for reproducibility model_ridge <- train(x=X_train,y=y, method="glmnet", metric="RMSE", maximize=FALSE, trControl=CARET.TRAIN.CTRL, tuneGrid=expand.grid(alpha=0, # Ridge regression lambda=lambdas)) ggplot(data=filter(model_ridge$result,RMSE<0.14)) + geom_line(aes(x=lambda,y=RMSE)) mean(model_ridge$resample$RMSE) # [1] 0.1308965 set.seed(123) # for reproducibility model_lasso <- train(x=X_train,y=y, method="glmnet", metric="RMSE", maximize=FALSE, trControl=CARET.TRAIN.CTRL, tuneGrid=expand.grid(alpha=1, # Lasso regression lambda=c(1,0.1,0.05,0.01,seq(0.009,0.001,-0.001), 0.00075,0.0005,0.0001))) model_lasso mean(model_lasso$resample$RMSE) # [1] 0.1260769 coef <- data.frame(coef.name = dimnames(coef(model_lasso$finalModel,s=model_lasso$bestTune$lambda))[[1]], coef.value = matrix(coef(model_lasso$finalModel,s=model_lasso$bestTune$lambda))) coef <- coef[-1,] picked_features <- nrow(filter(coef,coef.value!=0)) not_picked_features <- nrow(filter(coef,coef.value==0)) cat("Lasso picked",picked_features,"variables and eliminated the other", not_picked_features,"variables ") coef <- arrange(coef,-coef.value) imp_coef <- rbind(head(coef,10), tail(coef,10)) ggplot(imp_coef) + geom_bar(aes(x=reorder(coef.name,coef.value),y=coef.value), stat="identity") + ylim(-1.5,0.6) + coord_flip() + ggtitle("Coefficents in the Lasso Model") + theme(axis.title=element_blank()) preds <- exp(predict(model_lasso,newdata=X_test)) - 1 solution <- data.frame(Id=as.integer(rownames(X_test)),SalePrice=preds) write.csv(solution," lasso_sol.csv",row.names=FALSE)
预测结果如下:
其他的尝试:
尝试将lasso和ridge方法的预测结果,按照权重相加得到最终预测结果,但是结果并不理想,还没有单独lasso的效果好。可能跟设置的权重有关系,但在这个问题上本来ridge的方法就没有lasso好,权重相加本身可能就拉低了lasso预测效果。
听闻XGBoost在此方面效果比较好,或可一试。