zoukankan      html  css  js  c++  java
  • 机器学习与R语言:C5.0

    # ----------------------------------------
    # Purpose: demonstrate the C5.0 modelling workflow.
    # Dataset: University of Hamburg credit model, credit data.
    # ----------------------------------------

    ## Step 1: collecting the data ----
    # Import the CSV file; character columns are read in as factors.
    # NOTE(review): the path is absolute and specific to one machine -- adjust
    # it before running elsewhere.
    credit <- read.csv(
      file = "/Users/chenyangang/R语言/data/credit.csv",
      stringsAsFactors = TRUE
    )

    ## Step 2: exploring and preparing the data ----
    # Frequency tables for two of the categorical features.
    table(credit$checking_balance)
    table(credit$savings_balance)

    # Summary statistics (min, quartiles, median, mean, max) for two numeric
    # features.
    summary(credit$months_loan_duration)
    summary(credit$amount)

    # Class distribution of the outcome variable.
    table(credit$default)
    
    # Shuffle the rows so that the train/test split is random. The fixed seed
    # makes the shuffle, and therefore the whole analysis, reproducible.
    set.seed(12345)
    # Derive the row count from the data instead of hard-coding it, so the
    # shuffle always covers every row exactly once.
    n_obs <- nrow(credit)
    credit_rand <- credit[order(runif(n_obs)), ]

    # Sanity check: shuffling must leave the distribution unchanged (identical
    # summaries) while changing the row order (different head values).
    summary(credit$amount)
    summary(credit_rand$amount)
    head(credit$amount)
    head(credit_rand$amount)

    # Split the shuffled data: first 90% of the rows for training, the
    # remaining 10% for testing (900 / 100 rows when the file has 1000 rows).
    n_train <- round(0.9 * n_obs)
    credit_train <- credit_rand[seq_len(n_train), ]
    # Offsetting seq_len() keeps the test index valid even when it is empty.
    credit_test <- credit_rand[n_train + seq_len(n_obs - n_train), ]

    # The class proportions of the outcome should be similar in both sets.
    prop.table(table(credit_train$default))
    prop.table(table(credit_test$default))
    
    ## Step 3: training a model on the data ----

    library(C50)
    # ---------------------------------------------
    # Building the classifier:
    #   m <- C5.0(train, class, trials = 1, costs = NULL)
    #   train:  a data frame containing the training data
    #   class:  a factor vector with the class of each row in the training data
    #   trials: an optional number controlling the number of boosting
    #           iterations (1 by default)
    #   costs:  an optional matrix giving the costs associated with the
    #           different types of error
    #   Returns a C5.0 model object that can be used for prediction.
    #
    # Making predictions:
    #   p <- predict(m, test, type = "class")
    #   m:    a model trained by C5.0(train, class, trials = 1, costs = NULL)
    #   test: a data frame containing test data with the same features as the
    #         training data used to build the classifier
    #   type: either "class" or "prob", selecting whether the prediction is
    #         the most likely class value or the raw predicted probabilities
    #   Returns a vector of predicted class values or raw predicted
    #   probabilities, depending on the value of type.
    #
    # Example:
    #   credit_model <- C5.0(credit_train, loan_default)
    #   credit_prediction <- predict(credit_model, credit_test)
    # ----------------------------------------------
    # Build the decision tree model. The outcome column is excluded from the
    # predictors by name rather than by position, so the target cannot leak
    # into the features if the column order of the CSV ever changes.
    credit_model <- C5.0(
      credit_train[setdiff(names(credit_train), "default")],
      credit_train$default
    )

    # Print basic information about the fitted tree.
    credit_model

    # Print the detailed model summary.
    summary(credit_model)
    
    ## Step 4: evaluating model performance ----
    # Predict the outcome for every row of the held-out test set.
    credit_pred <- predict(credit_model, credit_test)

    # Cross-tabulate actual against predicted classes; the row, column and
    # chi-square proportions are switched off in the printed table.
    library(gmodels)
    CrossTable(
      x = credit_test$default,
      y = credit_pred,
      dnn = c("actual default", "predicted default"),
      prop.r = FALSE,
      prop.c = FALSE,
      prop.chisq = FALSE
    )
    
    ## Step 5: improving model performance ----

    ## Boosting the accuracy of decision trees
    # Boosted decision tree with 10 trials. The outcome column is dropped from
    # the predictors by name, so the result does not depend on column order.
    credit_boost10 <- C5.0(
      credit_train[setdiff(names(credit_train), "default")],
      credit_train$default,
      trials = 10
    )
    credit_boost10
    summary(credit_boost10)

    # Evaluate the 10-trial model on the test set.
    credit_boost_pred10 <- predict(credit_boost10, credit_test)
    CrossTable(
      credit_test$default, credit_boost_pred10,
      prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
      dnn = c("actual default", "predicted default")
    )

    # Boosted decision tree with 100 trials (not shown in text).
    credit_boost100 <- C5.0(
      credit_train[setdiff(names(credit_train), "default")],
      credit_train$default,
      trials = 100
    )
    credit_boost_pred100 <- predict(credit_boost100, credit_test)
    CrossTable(
      credit_test$default, credit_boost_pred100,
      prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
      dnn = c("actual default", "predicted default")
    )
    
    ## Making some mistakes more costly than others
    # Create a 2 x 2 cost matrix labelled with the outcome's class levels.
    # Without dimnames C5.0 falls back to the factor levels and emits a
    # warning; labelling the matrix explicitly avoids that and documents the
    # orientation (per the C50 documentation: rows = predicted class,
    # columns = actual class). The values fill column by column, so predicting
    # the first level when the actual class is the second level costs 4, and
    # the opposite error costs 1.
    cost_levels <- levels(credit_train$default)
    error_cost <- matrix(
      c(0, 1, 4, 0),
      nrow = 2,
      dimnames = list(predicted = cost_levels, actual = cost_levels)
    )
    error_cost

    # Apply the cost matrix to the tree. The outcome column is dropped from
    # the predictors by name, so the result does not depend on column order.
    credit_cost <- C5.0(
      credit_train[setdiff(names(credit_train), "default")],
      credit_train$default,
      costs = error_cost
    )
    credit_cost_pred <- predict(credit_cost, credit_test)

    CrossTable(
      credit_test$default, credit_cost_pred,
      prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
      dnn = c("actual default", "predicted default")
    )
    
    #### Part 2: Rule Learners -------------------

    ## Example: Identifying Poisonous Mushrooms ----
    ## Step 2: Exploring and preparing the data ----
    # Read the mushroom data from the working directory; character columns
    # are read in as factors.
    mushrooms <- read.csv(
      file = "mushrooms.csv",
      stringsAsFactors = TRUE
    )

    # Inspect column types and factor levels.
    str(mushrooms)

    # Remove the veil_type feature from the data frame.
    mushrooms[["veil_type"]] <- NULL

    # Class distribution of the outcome variable.
    table(mushrooms$type)
    
    ## Step 3: Training a model on the data ----
    library(RWeka)

    # Fit a OneR rule learner predicting type from all other columns.
    mushroom_1R <- OneR(
      type ~ .,
      data = mushrooms
    )

    ## Step 4: Evaluating model performance ----
    # Print the learned rule, then its evaluation summary.
    mushroom_1R
    summary(mushroom_1R)

    ## Step 5: Improving model performance ----
    # Fit a JRip rule learner on the same formula and data for comparison.
    mushroom_JRip <- JRip(
      type ~ .,
      data = mushrooms
    )
    mushroom_JRip
    summary(mushroom_JRip)

    # Rule learner using C5.0 decision trees (not in text): rules = TRUE asks
    # C5.0 for a rule-based model; only odor and gill_size are used as
    # predictors.
    library(C50)
    mushroom_c5rules <- C5.0(
      type ~ odor + gill_size,
      data = mushrooms,
      rules = TRUE
    )
    summary(mushroom_c5rules)
    

      

  • 相关阅读:
    Django的是如何工作的
    Robot Framework自动化测试(五)--- 开发系统关键字
    Swarm 如何存储数据?- 每天5分钟玩转 Docker 容器技术(103)
    如何滚动更新 Service?- 每天5分钟玩转 Docker 容器技术(102)
    Service 之间如何通信?- 每天5分钟玩转 Docker 容器技术(101)
    神奇的 routing mesh
    如何访问 Service?- 每天5分钟玩转 Docker 容器技术(99)
    Swarm 如何实现 Failover?- 每天5分钟玩转 Docker 容器技术(98)
    如何实现 Service 伸缩?- 每天5分钟玩转 Docker 容器技术(97)
    运行第一个 Service
  • 原文地址:https://www.cnblogs.com/tychyg/p/5345226.html
Copyright © 2011-2022 走看看