Kaggle Telecom Customer Churn Prediction: Fusing GBDT with LR
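
The idea behind the GBDT + LR fusion (popularized by Facebook's 2014 ad-CTR paper) is to use a trained GBDT as a feature transformer: each tree routes a sample to exactly one leaf, the leaf position is one-hot encoded, the per-tree vectors are concatenated, and a logistic regression is trained on the resulting sparse features (optionally alongside the original ones). A minimal sketch of the encoding, with made-up leaf ids, before the Spark code below:

    // Illustrative only: two tiny trees with hypothetical leaf-node ids.
    val leafIdsPerTree = Array(Array(3, 5, 6), Array(7, 9, 12))
    // Hypothetical leaf each tree assigns to one sample.
    val sampleLeafHits = Array(5, 12)
    // One-hot encode the hit leaf within each tree, then concatenate.
    val encoded = leafIdsPerTree.zip(sampleLeafHits).flatMap { case (leaves, hit) =>
        leaves.map(id => if (id == hit) 1.0 else 0.0)
    }
    // encoded: Array(0.0, 1.0, 0.0, 0.0, 0.0, 1.0)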

    package com.fiveonevv.app.Model
    
    import java.io.{FileInputStream, IOException, ObjectInputStream}
    
    import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.tree.GradientBoostedTrees
    import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, FeatureType}
    import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, Node}
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.SparkSession
    
    class GBDTPreprocessor extends Serializable {
        /**
          * Recursively collect the ids of all leaf nodes under the given node.
          * @param node tree node to start from
          * @return ids of the tree's leaf nodes
          */
        def getLeafNodes(node: Node): Array[Int] = {
            var treeLeafNodes = new Array[Int](0)
            if (node.isLeaf) {
                treeLeafNodes = treeLeafNodes.:+(node.id)
            } else {
                treeLeafNodes = treeLeafNodes ++ getLeafNodes(node.leftNode.get)
                treeLeafNodes = treeLeafNodes ++ getLeafNodes(node.rightNode.get)
            }
            treeLeafNodes
        }
    
        /**
          * Route a sample down the tree and return the leaf it lands in.
          * @param node tree node
          * @param features feature vector of the sample
          * @return id of the leaf node the sample falls into
          */
        def predictModify(node: Node, features: Vector): Int = {
            val split = node.split
            if (node.isLeaf) {
                node.id
            } else {
                // decide whether the split feature is continuous or categorical
                if (split.get.featureType == FeatureType.Continuous) {
                    if (features(split.get.feature) <= split.get.threshold) {
                        predictModify(node.leftNode.get, features)
                    } else {
                        predictModify(node.rightNode.get, features)
                    }
                } else {
                    if (split.get.categories.contains(features(split.get.feature))) {
                        predictModify(node.leftNode.get, features)
                    } else {
                        predictModify(node.rightNode.get, features)
                    }
                }
            }
        }
    
        def gbtTrain(gbtTrainData: RDD[LabeledPoint], numTrees: Int): (GradientBoostedTreesModel, Array[Array[Int]]) = {
            val boostingStrategy = BoostingStrategy.defaultParams("Classification")
            boostingStrategy.setNumIterations(numTrees)
            val gbdtModel = GradientBoostedTrees.train(gbtTrainData, boostingStrategy)
            val treeLeafArray = new Array[Array[Int]](numTrees)
    
            for (i <- 0.until(numTrees)) {
                // collect the leaf node ids of each tree
                treeLeafArray(i) = getLeafNodes(gbdtModel.trees(i).topNode)
            }
            (gbdtModel, treeLeafArray)
        }
    
    
        /**
          * One-hot encode, for every sample, the leaf it falls into in each tree.
          * @param gbtTestData data to generate features for, as (id, (label, features))
          * @param gbtModel trained GBT model
          * @param treeLeafArray leaf node ids of every tree in the GBT model
          * @param numTrees number of trees
          * @return RDD of (id, LabeledPoint) whose features are the concatenated one-hot leaf vectors
          */
        def gbtFeaturePredict(gbtTestData: RDD[(String, (Double, DenseVector))], gbtModel: GradientBoostedTreesModel, treeLeafArray: Array[Array[Int]], numTrees: Int): RDD[(String, LabeledPoint)] = {
            val newFeature = gbtTestData.map(line => {
                var gbtFeatures = new Array[Double](0)
                for (i <- 0.until(numTrees)) {
                    val treePredict = predictModify(gbtModel.trees(i).topNode, line._2._2)
                    val leafArray = new Array[Double]((gbtModel.trees(i).numNodes + 1) / 2)  // number of leaves of a full binary tree
                    // mark the leaf the sample falls into with 1
                    leafArray(treeLeafArray(i).indexOf(treePredict)) = 1  // position of the hit leaf within this tree
                    gbtFeatures = gbtFeatures ++ leafArray
                }
                (line._1, line._2._1, gbtFeatures)  // id, label, gbtFeatures
            })
            val gbtFeatureRDD = newFeature.map(
                x => (x._1, LabeledPoint(x._2, Vectors.dense(x._3)))
            )
            gbtFeatureRDD
        }
    
        /**
          * Build new features from the GBDT leaf assignments.
          * @param data labeled input data
          * @param model trained GBDT model
          * @param spark Spark session (unused in this method)
          * @param isAppend whether to append the new features to the original ones
          * @return LabeledPoints carrying the GBDT-constructed features
          */
        def getNodeListWithGBDT(data: RDD[LabeledPoint], model: GradientBoostedTreesModel, spark: SparkSession, isAppend: Boolean): Option[RDD[LabeledPoint]] = {
            val numTrees = model.numTrees
            // leaf node ids of each tree
            val treeLeafArray = new Array[Array[Int]](numTrees)
            for (i <- 0.until(numTrees)) {
                treeLeafArray(i) = getLeafNodes(model.trees(i).topNode)
            }
            // construct the new features
            val newData:RDD[LabeledPoint] = data.map(line => {
                var newFeatures = new Array[Double](0)
                for (i <- 0.until(numTrees)) {
                    // id of the leaf node this sample falls into
                    val treePredict = predictModify(model.trees(i).topNode, line.features)
                    val treeArray = new Array[Double]((model.trees(i).numNodes + 1) / 2)
                    treeArray(treeLeafArray(i).indexOf(treePredict)) = 1
                    newFeatures = newFeatures ++ treeArray
                }
                if (isAppend) {
                    new LabeledPoint(line.label, Vectors.dense(newFeatures ++ line.features.toArray))
                } else {
                    new LabeledPoint(line.label, Vectors.dense(newFeatures))
                }
            })
            Option(newData)
        }
    
        def loadModel(path: String): Option[GradientBoostedTreesModel] = {
            try {
                val in = new ObjectInputStream(new FileInputStream(path))
                val model = Option(in.readObject().asInstanceOf[GradientBoostedTreesModel])
                in.close()
                model
            } catch {
                case ex: ClassNotFoundException =>
                    ex.printStackTrace()
                    None
                case ex: IOException =>
                    ex.printStackTrace()
                    None
                case ex: Throwable =>
                    throw ex
            }
        }
    }
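
The loadModel above relies on plain Java serialization. As an aside, MLlib's GradientBoostedTreesModel also ships its own save/load, which is usually more portable across JVM versions; a short sketch (not part of the original post, path is a placeholder):

    import org.apache.spark.SparkContext
    import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel

    // Persist a trained model with MLlib's built-in format and read it back.
    def saveAndReload(sc: SparkContext, model: GradientBoostedTreesModel, path: String): GradientBoostedTreesModel = {
        model.save(sc, path)                      // writes metadata and tree data under `path`
        GradientBoostedTreesModel.load(sc, path)  // reloads the same model
    }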
    
    package com.fiveonevv.app.Model
    
    import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
    import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
    import org.apache.spark.ml.feature._
    import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
    import org.apache.spark.ml.{Pipeline, PipelineModel}
    import org.apache.spark.mllib.evaluation.MulticlassMetrics
    import org.apache.spark.mllib.linalg.DenseVector
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.{DataFrame, SparkSession}
    
    class GBDTLRModelProcess {
        /**
          * Preprocess locally read data into LabeledPoint and DenseVector form.
          * @param rdd lines read from a local txt file, containing features and label
          * @return denseVectorRDD
          */
        def localDataProcess(rdd:RDD[String]): RDD[(String, LabeledPoint, LabeledPoint, (Double, DenseVector))] = {
            val denseVectorRDD = rdd.map{
                line =>{
                    val arr = line.split("\t")
                    val userInfo = arr(0)
                    val nonFeatures = arr(1).split("#").map(_.toDouble)
                    val features = arr(2).split("#").map(_.toDouble)
                    val label = arr(3).toDouble
                    // build dense vectors: the LabeledPoints are used by the GBT model, the last tuple feeds feature discretization
                    (userInfo,LabeledPoint(label, new DenseVector(features)), LabeledPoint(label, new DenseVector(nonFeatures)),
                      (label, new DenseVector(nonFeatures)))
                }
            }
            denseVectorRDD
        }
    
        /**
          * Preprocess data read from Hive on a YARN cluster into LabeledPoint and DenseVector form.
          * @param rdd RDD converted from the Hive DataFrame
          * @return denseVectorRDD
          */
        def hiveDataProcess(rdd:RDD[(String, Array[Double], Array[Double], String)]): RDD[(String, LabeledPoint, LabeledPoint,
          (Double, DenseVector))] = {
    
            val denseVectorRDD = rdd.map{
                line => {
                    val userInfo = line._1
                    val numFeatures = line._2  // numeric features
                    val cateFeatures = line._3 // categorical features
                    val label = line._4.toDouble
                    // build dense vectors: the LabeledPoints are used by the GBT model, the last tuple feeds feature discretization
                    (userInfo,
                      LabeledPoint(label, new DenseVector(cateFeatures)),
                      LabeledPoint(label,new DenseVector(numFeatures)),
                      (label, new DenseVector(numFeatures)))
                }
            }
            denseVectorRDD
        }
    
    
        /**
          * Discretize the continuous features with GBDT.
          * @param train training data
          * @param test  test data
          * @param spark active SparkSession, used to create the output DataFrames
          * @return discretized train and test DataFrames
          */
        def gbtFeatureProcess(train:RDD[(String,LabeledPoint,LabeledPoint,(Double,DenseVector))],
                             test:RDD[(String,LabeledPoint,LabeledPoint,(Double,DenseVector))],
                             spark:SparkSession): (DataFrame, DataFrame) = {
            // categorical (already discrete) features
            val trainRDD = train.map(x => (x._1,x._2)).map(x => ((x._1,x._2.label),x._2.features.asML))
            val testRDD = test.map(x => (x._1,x._2)).map(x => ((x._1,x._2.label),x._2.features.asML))
            // continuous features
            val gbtTrain = train.map(x => x._3)
            val gbtTrainData = train.map(x => (x._1,x._4))
            val gbtTestData = test.map(x => (x._1,x._4))
            // discretize the continuous features
            val gbdtPreprocessor = new GBDTPreprocessor
            val numTrees = 10
            // treeLeafArray holds the leaf node ids of every tree
            val (gbtModel, treeLeafArray) = gbdtPreprocessor.gbtTrain(gbtTrain,numTrees)
            val gbtTrainRDD = gbdtPreprocessor.gbtFeaturePredict(gbtTrainData,gbtModel,treeLeafArray,numTrees)
              .map(x => ((x._1,x._2.label),x._2.features.asML))
            val allTrainRDD = trainRDD.join(gbtTrainRDD)
            val trainDF = spark.createDataFrame(allTrainRDD.map(x => (
              x._1._1,
              x._1._2,
              x._2._1,
              x._2._2)))
              .toDF("userInfo","label","feature1","feature2")
    
            val gbtTestRDD = gbdtPreprocessor.gbtFeaturePredict(gbtTestData,gbtModel,treeLeafArray,numTrees)
              .map(x => ((x._1,x._2.label),x._2.features.asML))
            val allTestRDD = testRDD.join(gbtTestRDD)
            val testDF = spark.createDataFrame(allTestRDD.map(x => (
              x._1._1,
              x._1._2,
              x._2._1,
              x._2._2
            )))
              .toDF("userInfo","label","feature1","feature2")
            (trainDF,testDF)
        }
    
    
        /**
          * Build the pipeline training flow: scaling, feature selection, grid search.
          * @param data training set
          * @return pipelineModel
          */
        def pipelineTrain(data:DataFrame): PipelineModel = {
            data.persist()
            val featureScaler = new MinMaxScaler()
              .setInputCol("features")
              .setOutputCol("scaledFeatures")
            val featureSelector = new ChiSqSelector()
              .setFeaturesCol("scaledFeatures")
              .setLabelCol("label")
              .setNumTopFeatures(80)
              .setOutputCol("selectedFeatures")
            val lr = new LogisticRegression()
              .setMaxIter(200)
              .setElasticNetParam(1.0)
              .setRegParam(0.001)
              .setThreshold(0.5)
              .setLabelCol("label")
              .setFeaturesCol("selectedFeatures")
            // build pipeline
            val pipeline = new Pipeline()
              .setStages(Array(featureScaler,featureSelector,lr))
            // grid search over number of features, regularization, elastic-net mixing and iterations
            val paramGrid = new ParamGridBuilder()
              .addGrid(featureSelector.numTopFeatures,Array(70))
              .addGrid(lr.maxIter,Array(100))
              .addGrid(lr.elasticNetParam,Array(1.0,0.0))
              .addGrid(lr.regParam,Array(0.00075))
              .build()
            // cross-validation
            val cv = new CrossValidator()
              .setEstimator(pipeline)
              .setEvaluator(new BinaryClassificationEvaluator())
              .setEstimatorParamMaps(paramGrid)
              .setNumFolds(5)
            val cvModel = cv.fit(data)
            val pipelineModel = cvModel.bestModel.asInstanceOf[PipelineModel]
            data.unpersist()
            pipelineModel
        }
    
    
        /**
          * Intermediate results of the pipeline.
          * @param data dataset with a features column
          * @param pipelineModel fitted pipeline (scaler, selector, LR)
          * @return scaled data, selected features, LR predictions
          */
        def pipelinePredict(data: DataFrame,pipelineModel: PipelineModel): (DataFrame, DataFrame, DataFrame) = {
            data.persist()
            val featureScaleModel = pipelineModel.stages(0).asInstanceOf[MinMaxScalerModel]
            val chiSqSelectorModel = pipelineModel.stages(1).asInstanceOf[ChiSqSelectorModel]
            val lrModel = pipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
    
            println("特征选择个数:",chiSqSelectorModel.explainParam(chiSqSelectorModel.numTopFeatures))
            println("LR迭代次数:",lrModel.explainParam(lrModel.maxIter))
            println("LR正则化系数:",lrModel.explainParam(lrModel.regParam))
            println("LR分类阈值:",lrModel.explainParam(lrModel.threshold))
            println("L1L2正则比例:",lrModel.explainParam(lrModel.elasticNetParam))
            println("LR特征个数:",lrModel.numFeatures)
            val scaledData = featureScaleModel.transform(data)          // min-max scaling
            val selectedData = chiSqSelectorModel.transform(scaledData) // feature selection
            val predictions = lrModel.transform(selectedData)           // LR prediction
    
            data.unpersist()
            (scaledData,selectedData,predictions)
        }
    
        /**
          * Assemble feature columns into one vector.
          * @param data DataFrame containing the feature1 and feature2 columns
          * @return DataFrame with the assembled features column
          */
        def featureAssembler(data:DataFrame):DataFrame ={
            val assembler = new VectorAssembler()
              .setInputCols(Array("feature1", "feature2"))
              .setOutputCol("features")
            val output = assembler.transform(data)
            output
        }
    
    
        /**
          * Evaluate model performance.
          * @param data RDD of (prediction, label) pairs
          * @return accuracy, weighted precision, weighted recall, weighted F1
          */
        def multiClassEvaluate(data: RDD[(Double,Double)]): (Double,Double,Double,Double) = {
            val metrics = new MulticlassMetrics(data)
            val accuracy = metrics.accuracy
            val weightedPrecision = metrics.weightedPrecision
            val weightedRecall = metrics.weightedRecall
            val f1 = metrics.weightedFMeasure
            (accuracy,weightedPrecision,weightedRecall,f1)
        }
    }
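
multiClassEvaluate is not wired into the driver below; a small sketch of how it could be called (assuming `predictions` is the third DataFrame returned by pipelinePredict, with `prediction` and `label` columns):

    // Convert the prediction DataFrame into the (prediction, label) RDD that MulticlassMetrics expects.
    val predictionAndLabel = predictions
      .select("prediction", "label")
      .rdd
      .map(row => (row.getDouble(0), row.getDouble(1)))
    val (accuracy, weightedPrecision, weightedRecall, f1) =
      new GBDTLRModelProcess().multiClassEvaluate(predictionAndLabel)
    println(s"accuracy=$accuracy precision=$weightedPrecision recall=$weightedRecall f1=$f1")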
    
    package com.fiveonevv.app.core
    
    import com.fiveonevv.app.Model.GBDTLRModelProcess
    import com.fiveonevv.app.util.SparkSqlUtil
    import org.apache.log4j.{Level, Logger}
    import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
    import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
    import org.apache.spark.ml.{Pipeline, PipelineStage}
    import org.apache.spark.sql.functions.{udf, _}
    
    import scala.collection.mutable.ListBuffer
    
    object GBDTLrTrain {
        Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
        Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
        def main(args: Array[String]): Unit = {
            val spark = SparkSqlUtil.initSparkSession(SparkSqlUtil.initSparkBuilder(),"GBDTLRTrainDemo")
            // read data from Hive
            val rawDF = spark
              .sql("""SELECT * FROM tmp.telco_churn""")
              .na.fill(0.0,Seq("TotalCharges"))
            // categorical and numeric columns
            val cateCols = Array("gender","partner","dependents","phone_service","multiple_lines","internet_service","online_security",
                "online_backup","device_protection","tech_support","streaming_tv","streaming_movies","paperless_billing","payment_method")
            val numCols = Array("senior_citizen","tenure","monthly_charges","total_charges")
            // index the categorical columns
            val indexer = cateCols.map(colName => new StringIndexer().setInputCol(colName).setOutputCol(s"${colName}Index"))
            //val encoder = new OneHotEncoderEstimator().setInputCols(indexCols).setOutputCols(cateCols map (name => s"${name}Vec"))
            // assemble the categorical features
            val cateAssembler = new VectorAssembler().setInputCols(cateCols.map(_ + "Index")).setOutputCol("cateFeatures")
            // assemble the numeric features
            val numAssembler = new VectorAssembler().setInputCols(numCols).setOutputCol("numFeatures").setHandleInvalid("skip")
            val stagesArray = new ListBuffer[PipelineStage]()
            for (stringIndexer <- indexer) {
                stagesArray.append(stringIndexer)
            }
            stagesArray.append(cateAssembler,numAssembler)
            val dataPrePipeline = new Pipeline().setStages(stagesArray.toArray)
            // the pipeline output mixes sparse and dense vectors; convert everything to dense vectors
            val toDense = udf((v: org.apache.spark.ml.linalg.Vector) => v.toDense)
            val processedRDD = dataPrePipeline.fit(rawDF).transform(rawDF)
              .selectExpr("customerid","numFeatures","cateFeatures","case when churn = 'Yes' then 1.0 else 0.0 end as label")
              .withColumn("cateDenseFeatures",toDense(col("cateFeatures")))
              .selectExpr("customerid","numFeatures","cateDenseFeatures cateFeatures","label")
              .rdd.map(x => (
                x(0).toString,
                // ml vectors cannot be converted to mllib vectors directly; go through Array and rebuild as mllib dense vectors
                x(1).asInstanceOf[org.apache.spark.ml.linalg.Vector].toArray,
                x(2).asInstanceOf[org.apache.spark.ml.linalg.DenseVector].toArray,
                x(3).toString)
            )
    
            val Array(trainRDD, testRDD) = processedRDD.randomSplit(Array(0.7, 0.3), seed = 1234)
            val modelProcess = new GBDTLRModelProcess
            val denseVectorTrainRDD = modelProcess.hiveDataProcess(trainRDD)
            val denseVectorTestRDD = modelProcess.hiveDataProcess(testRDD)
    
            // GBT training: discretize the continuous features and combine them with the original categorical features
            val (gbtFeatureTrainDF, gbtFeatureTestDF) = modelProcess.gbtFeatureProcess(denseVectorTrainRDD, denseVectorTestRDD, spark)
            val unionTrainDF = modelProcess.featureAssembler(gbtFeatureTrainDF) // merge the GBT-discretized features with the original features
            val unionTestDF = modelProcess.featureAssembler(gbtFeatureTestDF)
    
            // optional upsampling of positives in the training data (the duplication below is commented out)
            val positiveDF = unionTrainDF.filter("label=1")
            val negativeDF = unionTrainDF.filter("label=0")
            val upPositiveDF = positiveDF//.union(positiveDF).union(positiveDF)
            val upSampleDF = negativeDF.union(upPositiveDF)
    
            // pipeline training and prediction
            val pipelineModel = modelProcess.pipelineTrain(upSampleDF)
            val (scaledDF, selectedDF, predictions) = modelProcess.pipelinePredict(unionTestDF, pipelineModel)
    
            // evaluate the model
            predictions.select("userInfo","label","rawPrediction","probability","prediction").show(50)
            val evaluator = new BinaryClassificationEvaluator().setLabelCol("label")
            val areaUnderROC = evaluator.setMetricName("areaUnderROC").evaluate(predictions)
            val areaUnderPR = evaluator.setMetricName("areaUnderPR").evaluate(predictions)
    
            // inspect the model's performance on the test set
            val lp = predictions.select( "label", "prediction")
            val countTotal = predictions.count()
            val correct = lp.filter(lp("label") === lp("prediction")).count()  // number of correctly predicted samples
            lp.show(200)
            val ratioCorrect = correct.toDouble / countTotal.toDouble
    
            // label 1 = churned, 0 = retained
            val truePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") === lp("prediction")).count()  // true positives (churned, predicted churned)
            val falsePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") =!= lp("prediction")).count()  // false positives (retained, predicted churned)
            val trueNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") === lp("prediction")).count()  // true negatives (retained, predicted retained)
            val falseNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") =!= lp("prediction")).count()  // false negatives (churned, predicted retained)

            // true positive rate and false positive rate
            val tpr = truePositive.toDouble / (truePositive + falseNegative)
            val fpr = falsePositive.toDouble / (falsePositive + trueNegative)
            // precision for churned users
            val positivePrecision = truePositive.toDouble / (truePositive + falsePositive)
            // recall for churned users
            val positiveRecall = truePositive.toDouble / (truePositive + falseNegative)
            // precision for retained users
            val negativePrecision = trueNegative.toDouble / (trueNegative + falseNegative)
            // recall for retained users
            val negativeRecall = trueNegative.toDouble / (trueNegative + falsePositive)
            println(s"预测样本总数: $countTotal")
            println(s"正确预测样本数量: $correct")
            println(s"模型准确率: $ratioCorrect")
            println(s"模型ROC值:$areaUnderROC")
            println(s"模型PR值:$areaUnderPR")
            println(s"预测结果中真流失用户个数:$truePositive")
            println(s"预测结果中假流失用户个数:$falsePositive")
            println(s"预测结果中真流失用户比例: $tpr")
            println(s"预测结果中假流失用户比例: $fpr")
            println(s"流失用户查准率:$positivePrecision")
            println(s"流失用户召回率:$positiveRecall")
            println(s"留存用户查准率:$negativePrecision")
            println(s"留存用户召回率:$negativeRecall")
            spark.stop()
        }
    }
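
    SparkSqlUtil is referenced above but not included in the post. A minimal stand-in (an assumption for local experimentation, not the author's actual utility) that matches the two calls used in main could look like this:

    package com.fiveonevv.app.util

    import org.apache.spark.sql.SparkSession

    object SparkSqlUtil {
        // Start a builder; Hive support is assumed because the driver reads from Hive.
        def initSparkBuilder(): SparkSession.Builder =
            SparkSession.builder().enableHiveSupport()

        // Name the application and create (or reuse) the session.
        def initSparkSession(builder: SparkSession.Builder, appName: String): SparkSession =
            builder.appName(appName).getOrCreate()
    }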
    
    scala> val evaluator = new BinaryClassificationEvaluator().setLabelCol("label")
    evaluator: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_f0b527f4e73d
    
    scala> val areaUnderROC = evaluator.setMetricName("areaUnderROC").evaluate(predictions)
    areaUnderROC: Double = 0.8306899086101781                                       
    
    scala> val areaUnderPR = evaluator.setMetricName("areaUnderPR").evaluate(predictions)
    areaUnderPR: Double = 0.6296575868466127                                        
    
    scala> val lp = predictions.select( "label", "prediction")
    lp: org.apache.spark.sql.DataFrame = [label: double, prediction: double]
    
    scala> val countTotal = predictions.count()
    countTotal: Long = 2095
    
    scala> val truePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") === lp("prediction")).count()  // true positives (churned, predicted churned)
    truePositive: Long = 270

    scala> val falsePositive = lp.filter(lp("prediction") === 1.0).filter(lp("label") =!= lp("prediction")).count()  // false positives (retained, predicted churned)
    falsePositive: Long = 146

    scala> val trueNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") === lp("prediction")).count()  // true negatives (retained, predicted retained)
    trueNegative: Long = 1397

    scala> val falseNegative = lp.filter(lp("prediction") === 0.0).filter(lp("label") =!= lp("prediction")).count()  // false negatives (churned, predicted retained)
    falseNegative: Long = 282
    
    scala> val tpr = truePositive.toDouble / (truePositive + falseNegative)
    tpr: Double = 0.4891304347826087
    
    scala> val fpr = falsePositive.toDouble / (falsePositive + trueNegative)
    fpr: Double = 0.09462086843810759
    
    scala> val positivePrecision = truePositive.toDouble / (truePositive + falsePositive)
    positivePrecision: Double = 0.6490384615384616
    
    scala> val positiveRecall = truePositive.toDouble / (truePositive + falseNegative)
    positiveRecall: Double = 0.4891304347826087
    
    scala> val negativePrecision = trueNegative.toDouble / (trueNegative + falseNegative)
    negativePrecision: Double = 0.8320428826682549
    
    scala> val negativeRecall = trueNegative.toDouble / (trueNegative + falsePositive)
    negativeRecall: Double = 0.9053791315618924
    
    scala> println(s"预测样本总数: $countTotal")
    预测样本总数: 2095
    
    scala> println(s"正确预测样本数量: $correct")
    正确预测样本数量: 1667
    
    scala> println(s"模型准确率: $ratioCorrect")
    模型准确率: 0.7957040572792363
    
    scala> println(s"模型ROC值:$areaUnderROC")
    模型ROC值:0.8306899086101781
    
    scala> println(s"模型PR值:$areaUnderPR")
    模型PR值:0.6296575868466127
    
    scala> println(s"预测结果中真流失用户个数:$truePositive")
    预测结果中真流失用户个数:270
    
    scala> println(s"预测结果中假流失用户个数:$falsePositive")
    预测结果中假流失用户个数:146
    
    scala> println(s"预测结果中真流失用户比例: $tpr")
    预测结果中真流失用户比例: 0.4891304347826087
    
    scala> println(s"预测结果中假流失用户比例: $fpr")
    预测结果中假流失用户比例: 0.0946208
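
As a quick check on the transcript numbers (not part of the original session), the printed accuracy is just (TP + TN) / total:

    // Sanity check: accuracy equals (TP + TN) / total for the counts above.
    val acc = (270 + 1397).toDouble / 2095  // = 1667 / 2095 = 0.7957040572792363, matching ratioCorrect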
    
Original article: https://www.cnblogs.com/swordspoet/p/14683122.html