library(data.table)
library(randomForest)

# k-fold cross-validation: predict Sepal.Length in iris with a random forest
# and summarise the absolute prediction error over all held-out rows.
data <- iris
str(data)

# Number of folds.
k <- 5L
stopifnot(k >= 2L, nrow(data) >= k)

# Assign every row to a fold. Shuffling a recycled 1..k sequence keeps the
# fold sizes within one row of each other, so no fold can end up empty.
data$id <- sample(rep_len(seq_len(k), nrow(data)))

# Name the predictors explicitly so that the fold label `id` is never used as
# a feature; it only records the fold assignment.
response <- "Sepal.Length"
predictors <- setdiff(names(data), c(response, "id"))
model_formula <- reformulate(predictors, response = response)

# One slot per fold; filled inside the loop and combined once afterwards
# instead of growing a table with rbind() on every iteration.
fold_predictions <- vector("list", k)
fold_actuals <- vector("list", k)

# Text progress bar from the utils package, to follow the CV progress.
progress_bar <- utils::txtProgressBar(min = 0, max = k, style = 3)

for (i in seq_len(k)) {
  # Rows outside fold i form the training set; rows in fold i the test set.
  trainingset <- data[data$id != i, , drop = FALSE]
  testset <- data[data$id == i, , drop = FALSE]

  # Fit a random forest on the training folds.
  mymodel <- randomForest(model_formula, data = trainingset, ntree = 100)

  # Predict the held-out fold from the predictor columns only.
  fold_predictions[[i]] <- data.table(
    Predicted = as.numeric(
      predict(mymodel, newdata = testset[, predictors, drop = FALSE])
    )
  )

  # Keep the observed Sepal.Length of the same rows, in the same order.
  fold_actuals[[i]] <- data.table(Actual = testset[[response]])

  utils::setTxtProgressBar(progress_bar, i)
}
close(progress_bar)

# Combine the per-fold pieces once.
prediction <- rbindlist(fold_predictions)
testsetCopy <- rbindlist(fold_actuals)

# Put the predictions next to the actual values.
result <- cbind(prediction, testsetCopy)
result$Difference <- abs(result$Actual - result$Predicted)

# Distribution of the absolute error; its mean is the mean absolute error.
print(summary(result$Difference))
# Cross-validation pseudocode (training with early stopping on validation
# accuracy):
#
# for each epoch
#     for each training data instance
#         propagate error through the network
#         adjust the weights
#         calculate the accuracy over training data
#     for each validation data instance
#         calculate the accuracy over the validation data
#     if the threshold validation accuracy is met
#         exit training
#     else
#         continue training