zoukankan      html  css  js  c++  java
  • 逻辑回归 评价推荐质量 计算AUC

    有两个概念需要弄清楚:
    1 ROC
    2 AUC
    
    package org.apache.spark.mllib.classification
    
    import org.apache.log4j.Logger
    import org.apache.log4j.Level
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.util.MLUtils
    import org.apache.spark.rdd.RDD
    import org.apache.spark.mllib.linalg.BLAS.dot
    import org.apache.spark.{SparkConf, SparkContext}
    
    
    /**
      * Created by root on 2016/9/24.
      * 评价推荐质量 :
      *
      */
    object auc {
    
      def main(args: Array[String]) {
        Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    
    
        val conf = new SparkConf().setAppName("auc").setMaster("local[*]")
        val sc = new SparkContext(conf)
        //原数据格式---> LogisticRegressionDataGenerator 生成的数据 <master> <output_dir> <num_examples> <num_features> <num_partitions>
        //local path 100000 100 1
        //(0.0,[1.1419053154730547,0.9194079489827879,-0.9498666368908959,-1.1069902863993377,0.2809776380727795,0.6846227956326554,-0.8172214073987268,-1.3966434026780434,-0.19094451307087512,1.4862133923906502,0.8023071496873626,-0.12151292466549345,1.4105062239438624,-0.6402327822135738,-1.2096444592532913,0.35375769787202876,-0.4903496491990076,0.5507215382743629,-1.2035510019650835,0.3210160806416416,1.5511476388671834,0.43853028624710505,0.4815980608245389,1.5196310789680683,-0.2768317291873249,-0.08393897849486337,1.255833005788796,-0.3252727938665772,-0.17329033306108363,-1.8585851445864527,1.4238069456328435,-1.363726024075023,-1.964666098753878,-0.9185948439341892,-2.548887393384806,-1.6309606578419305,-0.12200477461989162,1.289159071801577,-0.2691388556559934,0.2574914085090889,-0.3199143760045327,-1.7684998592513064,-0.4834503128592458,-0.5099904653893699,1.1166733769661994,-0.04094720151728288,-1.1076715169200795,1.8623214176471945,1.1457411377091524,-1.0586772048930921,1.0725991339400673,-1.9317441520296659,0.30102521611534994,0.2475231582804265,1.406156849249087,-1.5202207203569256,0.2709294126920897,0.561249284813777,-0.5298295780368607,0.5390221914988275,2.2123402141787243,-0.6329335687728442,-1.8831759122084633,0.3865659853763343,0.32582927090649455,-0.9013043195000002,-0.002680308907617573,-0.4739592549853249,-0.5479781547659026,-0.01910014847196348,1.6468163882596327,-1.107062592215791,0.5938103926672539,-0.15566462108511642,0.6632872929286855,1.226793360688623,0.8839698437730904,0.22172454670212935,0.9197020859698617,-0.7393758185888677,0.803517749531419,-0.2539417447630359,-0.7638388605060555,-1.8645567427274516,-1.861306200027518,-0.576599881116305,-0.40899380621224757,0.24846093761654187,-0.48091295490277447,0.44621205735391023,-0.4465888888803913,0.045638687865053575,0.7045663273135641,-0.2718240183671583,0.08074877915238832,1.2590964696340183,0.7635098382407334,1.7220810801509723,0.14595005405372477,-0.9946630124621867])
        //(1.0,[3.45334487943329,3.703049287361516,2.5130165929545307,3.0800075574677215,3.171627421031228,4.510723893197076,2.6581375743525686,3.1156651789533387,4.47340262637963,2.4049108266235555,4.255158010564988,2.3258025009719385,2.887640312921763,3.982413153985827,4.225500378590178,3.153701048234778,3.3226444299363393,3.1134293366028674,2.7082537229307286,4.181225794555864,2.4777291910879735,3.3541041623501155,2.509535608654829,3.6520241921439593,2.7357293459939838,3.7219713080774572,1.79978608558576,2.1490013130808725,3.0506491504672213,4.282604805576632,2.3453302124452065,4.765552217918703,2.576396589784987,2.2122195494057304,3.777122545381258,1.2790576486032348,2.6425654857274985,3.212018531982713,5.001124074462854,3.1785870770834213,3.9658649468050955,3.991180264704203,1.950852602272741,3.7912958928979705,1.739948405239964,2.9715486731415477,2.6257571982670096,1.826757810204818,3.3903795303152915,2.641366174472212,2.268682514813932,2.302301777328137,1.318053862077372,1.4887538557215252,4.596202655233997,2.708452851966949,2.535659883929069,2.159712854763802,4.3592175241886935,1.2835999065428711,3.8212126122949757,0.9923739981417254,1.2082442788719718,3.09057936644558,2.0470303995207977,2.281734506191289,4.088998687981933,3.3761246055228167,1.5022126296834528,1.6766482086025092,3.0925674715305713,1.6349647008447825,3.204433721898702,2.127761369019482,3.4994059501116883,4.407179021668299,3.257583684085755,3.313665712868691,2.0566775385443052,4.3593727689426345,2.521146674716752,3.261467251430135,1.0213751954482944,3.181153154383373,4.502250899343585,3.2313429276554966,2.8643146434649607,2.6234099229109504,3.7240089562669496,2.7780075804328814,2.709132808617267,3.2789299190580423,2.721715981494141,4.2277698250773055,3.979624410666172,3.3718544694471473,0.7743217015416155,3.61345808170633,2.3576880804445057,3.6785246541125773])
    
        val data: RDD[LabeledPoint] =MLUtils.loadLabeledPoints(sc,"D:\bigdataworkspaces\dataout\dd\part-00000")
        //loadlabledPoints
        //(0.0,[1.1419053154730547,0.9194079489827879,-0.9498666368908959,-1.1069902863993377,0.2809776380727795,0.6846227956326554,-0.8172214073987268,-1.3966434026780434,-0.19094451307087512,1.4862133923906502,0.8023071496873626,-0.12151292466549345,1.4105062239438624,-0.6402327822135738,-1.2096444592532913,0.35375769787202876,-0.4903496491990076,0.5507215382743629,-1.2035510019650835,0.3210160806416416,1.5511476388671834,0.43853028624710505,0.4815980608245389,1.5196310789680683,-0.2768317291873249,-0.08393897849486337,1.255833005788796,-0.3252727938665772,-0.17329033306108363,-1.8585851445864527,1.4238069456328435,-1.363726024075023,-1.964666098753878,-0.9185948439341892,-2.548887393384806,-1.6309606578419305,-0.12200477461989162,1.289159071801577,-0.2691388556559934,0.2574914085090889,-0.3199143760045327,-1.7684998592513064,-0.4834503128592458,-0.5099904653893699,1.1166733769661994,-0.04094720151728288,-1.1076715169200795,1.8623214176471945,1.1457411377091524,-1.0586772048930921,1.0725991339400673,-1.9317441520296659,0.30102521611534994,0.2475231582804265,1.406156849249087,-1.5202207203569256,0.2709294126920897,0.561249284813777,-0.5298295780368607,0.5390221914988275,2.2123402141787243,-0.6329335687728442,-1.8831759122084633,0.3865659853763343,0.32582927090649455,-0.9013043195000002,-0.002680308907617573,-0.4739592549853249,-0.5479781547659026,-0.01910014847196348,1.6468163882596327,-1.107062592215791,0.5938103926672539,-0.15566462108511642,0.6632872929286855,1.226793360688623,0.8839698437730904,0.22172454670212935,0.9197020859698617,-0.7393758185888677,0.803517749531419,-0.2539417447630359,-0.7638388605060555,-1.8645567427274516,-1.861306200027518,-0.576599881116305,-0.40899380621224757,0.24846093761654187,-0.48091295490277447,0.44621205735391023,-0.4465888888803913,0.045638687865053575,0.7045663273135641,-0.2718240183671583,0.08074877915238832,1.2590964696340183,0.7635098382407334,1.7220810801509723,0.14595005405372477,-0.9946630124621867])
        //(1.0,[3.45334487943329,3.703049287361516,2.5130165929545307,3.0800075574677215,3.171627421031228,4.510723893197076,2.6581375743525686,3.1156651789533387,4.47340262637963,2.4049108266235555,4.255158010564988,2.3258025009719385,2.887640312921763,3.982413153985827,4.225500378590178,3.153701048234778,3.3226444299363393,3.1134293366028674,2.7082537229307286,4.181225794555864,2.4777291910879735,3.3541041623501155,2.509535608654829,3.6520241921439593,2.7357293459939838,3.7219713080774572,1.79978608558576,2.1490013130808725,3.0506491504672213,4.282604805576632,2.3453302124452065,4.765552217918703,2.576396589784987,2.2122195494057304,3.777122545381258,1.2790576486032348,2.6425654857274985,3.212018531982713,5.001124074462854,3.1785870770834213,3.9658649468050955,3.991180264704203,1.950852602272741,3.7912958928979705,1.739948405239964,2.9715486731415477,2.6257571982670096,1.826757810204818,3.3903795303152915,2.641366174472212,2.268682514813932,2.302301777328137,1.318053862077372,1.4887538557215252,4.596202655233997,2.708452851966949,2.535659883929069,2.159712854763802,4.3592175241886935,1.2835999065428711,3.8212126122949757,0.9923739981417254,1.2082442788719718,3.09057936644558,2.0470303995207977,2.281734506191289,4.088998687981933,3.3761246055228167,1.5022126296834528,1.6766482086025092,3.0925674715305713,1.6349647008447825,3.204433721898702,2.127761369019482,3.4994059501116883,4.407179021668299,3.257583684085755,3.313665712868691,2.0566775385443052,4.3593727689426345,2.521146674716752,3.261467251430135,1.0213751954482944,3.181153154383373,4.502250899343585,3.2313429276554966,2.8643146434649607,2.6234099229109504,3.7240089562669496,2.7780075804328814,2.709132808617267,3.2789299190580423,2.721715981494141,4.2277698250773055,3.979624410666172,3.3718544694471473,0.7743217015416155,3.61345808170633,2.3576880804445057,3.6785246541125773])
        //RDD 的 randomSplit 方法用于将数据集分成 训练集和测试集
        val splitdta = data.randomSplit(Array(1,9))
        val trainData = splitdta(1)
        val testData = splitdta(0)
    
        //逻辑回归训练 得到model
    
        val model: LogisticRegressionModel = LogisticRegressionWithSGD.train(trainData,100,0.8)
    
        //测试数据库标签
        val lable = testData.map(_.label)
        //测试数据集特征值
        val feature = testData.map(_.features)
        //计算每个样本的score 得分
        val score = feature.map(f =>{
          val margin = dot(model.weights, f) + model.intercept
          val sco = 1.0/(1.0+math.exp(-margin))
          sco
        })
        //得到得分后zip 上 label 并排序 ,index 就是rank 值
        val scoreLabel: RDD[((Double, Double), Long)] = score.zip(lable).sortBy(_._1).zipWithIndex()
        scoreLabel.cache()
        //scoreLabel.take(4).foreach(print)
    
        //正例返回rank 负例返回o ,最后sum 的值就是正例的rank 值之和
        val rank = scoreLabel.map(sample=>if(sample._1._2.equals(1.0)){
          sample._2
        }else{
    
          0.0
        }).sum()
        //标签中正例值 1.0  - 0.0 sum 的结果就是正例的个数
        val M = lable.sum()
        val N =lable.count()-M
        //带入公式计算auc
        val auc = (rank -M*(M+1)/2)/(M*N)
    
        println("Auc==" + auc)
    
    
        //data.foreach(println)
    
      }
    
    }



    结果:
    "C:Program FilesJavajdk1.7.0_80injava" -Didea.launcher.port=7532 "-Didea.launcher.bin.path=C:Program Files (x86)JetBrainsIntelliJ IDEA Community Edition 2016.1.3in" -Dfile.encoding=UTF-8 -classpath "C:Program FilesJavajdk1.7.0_80jrelibcharsets.jar;C:Program FilesJavajdk1.7.0_80jrelibdeploy.jar;C:Program FilesJavajdk1.7.0_80jrelibextaccess-bridge-64.jar;C:Program FilesJavajdk1.7.0_80jrelibextdnsns.jar;C:Program FilesJavajdk1.7.0_80jrelibextjaccess.jar;C:Program FilesJavajdk1.7.0_80jrelibextlocaledata.jar;C:Program FilesJavajdk1.7.0_80jrelibextsunec.jar;C:Program FilesJavajdk1.7.0_80jrelibextsunjce_provider.jar;C:Program FilesJavajdk1.7.0_80jrelibextsunmscapi.jar;C:Program FilesJavajdk1.7.0_80jrelibextzipfs.jar;C:Program FilesJavajdk1.7.0_80jrelibjavaws.jar;C:Program FilesJavajdk1.7.0_80jrelibjce.jar;C:Program FilesJavajdk1.7.0_80jrelibjfr.jar;C:Program FilesJavajdk1.7.0_80jrelibjfxrt.jar;C:Program FilesJavajdk1.7.0_80jrelibjsse.jar;C:Program FilesJavajdk1.7.0_80jrelibmanagement-agent.jar;C:Program FilesJavajdk1.7.0_80jrelibplugin.jar;C:Program FilesJavajdk1.7.0_80jrelib
    esources.jar;C:Program FilesJavajdk1.7.0_80jrelib
    t.jar;D:igdataworkspaces
    ecommderoutproduction
    ecommder;F:scalalibscala-actors-migration.jar;F:scalalibscala-actors.jar;F:scalalibscala-library.jar;F:scalalibscala-reflect.jar;F:scalalibscala-swing.jar;D:igdataworkspaces
    ecommderlibspark-assembly-1.6.0-hadoop2.6.0.jar;C:Program Files (x86)JetBrainsIntelliJ IDEA Community Edition 2016.1.3libidea_rt.jar" com.intellij.rt.execution.application.AppMain org.apache.spark.mllib.classification.auc
    Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
    16/09/24 15:21:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    16/09/24 15:21:05 INFO Slf4jLogger: Slf4jLogger started
    16/09/24 15:21:05 INFO Remoting: Starting remoting
    16/09/24 15:21:06 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.1.171:16869]
    16/09/24 15:21:11 WARN : Your hostname, root resolves to a loopback/non-reachable address: fe80:0:0:0:0:5efe:c0a8:8c01%17, but we couldn't find any external IP address!
    16/09/24 15:21:23 INFO FileInputFormat: Total input paths to process : 1
    16/09/24 15:21:24 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
    16/09/24 15:21:24 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
    16/09/24 15:21:24 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
    16/09/24 15:21:24 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
    16/09/24 15:21:24 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
    16/09/24 15:21:55 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
    16/09/24 15:21:55 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
    [Stage 209:===============================================>         (5 + 1) / 6]Auc==0.9998032270759544
    
    Process finished with exit code 0

  • 相关阅读:
    Proj THUDBFuzz Paper Reading: The Art, Science, and Engineering of Fuzzing: A Survey
    Proj THUDBFuzz Paper Reading: A systematic review of fuzzing based on machine learning techniques
    9.3 付费代理的使用
    11.1 Charles 的使用
    第十一章 APP 的爬取
    10.2 Cookies 池的搭建
    10.1 模拟登录并爬取 GitHub
    11.5 Appium 爬取微信朋友圈
    11.4 Appium 的基本使用
    11.3 mitmdump 爬取 “得到” App 电子书信息
  • 原文地址:https://www.cnblogs.com/TendToBigData/p/10501368.html
Copyright © 2011-2022 走看看