  • Running the AAS Code: Chapter 4

    [root@node1 aas]# ls
    ch02  ch03  spark-1.2.1-bin-hadoop2.4  spark-1.2.1-bin-hadoop2.4.tgz
    [root@node1 aas]# cd spark-1.2.1-bin-hadoop2.4
    [root@node1 spark-1.2.1-bin-hadoop2.4]# cd ..
    [root@node1 aas]# mkdir ch04
    [root@node1 aas]# cd ch04
    [root@node1 ch04]# ls
    [root@node1 ch04]# wget https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
    --2015-12-06 08:52:34--  https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
    Resolving archive.ics.uci.edu... 128.195.10.249
    Connecting to archive.ics.uci.edu|128.195.10.249|:443... connected.
    HTTP request sent, awaiting response... 200 OK
    Length: 11240707 (11M) [application/x-gzip]
    Saving to: “covtype.data.gz”
    
    100%[===============================================================================================================================================================>] 11,240,707  2.62M/s   in 4.2s    
    
    2015-12-06 08:52:39 (2.53 MB/s) - “covtype.data.gz” saved [11240707/11240707]
    
    [root@node1 ch04]# wget https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
    --2015-12-06 08:53:00--  https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
    Resolving archive.ics.uci.edu... 128.195.10.249
    Connecting to archive.ics.uci.edu|128.195.10.249|:443... connected.
    HTTP request sent, awaiting response... 200 OK
    Length: 14610 (14K) [text/plain]
    Saving to: “covtype.info”
    
    100%[===============================================================================================================================================================>] 14,610      --.-K/s   in 0.001s  
    
    2015-12-06 08:53:01 (15.6 MB/s) - “covtype.info” saved [14610/14610]
    
    [root@node1 ch04]# wget https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/old_covtype.info
    --2015-12-06 08:53:25--  https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/old_covtype.info
    Resolving archive.ics.uci.edu... 128.195.10.249
    Connecting to archive.ics.uci.edu|128.195.10.249|:443... connected.
    HTTP request sent, awaiting response... 200 OK
    Length: 4847 (4.7K) [text/plain]
    Saving to: “old_covtype.info”
    
    100%[===============================================================================================================================================================>] 4,847       --.-K/s   in 0s      
    
    2015-12-06 08:53:26 (12.7 MB/s) - “old_covtype.info” saved [4847/4847]

    Put the data on HDFS

    [root@node1 ch04]# ls
    covtype.data.gz  covtype.info  old_covtype.info
    [root@node1 ch04]# gunzip -d covtype.data.gz 
    [root@node1 ch04]# ll
    total 73432
    -rw-r--r-- 1 root root 75169317 Sep  1  1998 covtype.data
    -rw-r--r-- 1 root root    14610 Apr 18  2010 covtype.info
    -rw-r--r-- 1 root root     4847 Sep  1  1998 old_covtype.info
    [root@node1 ch04]# hdfs dfs -mkdir /user/root/covtype
    [root@node1 ch04]# hdfs dfs -put * /user/root/covtype
    [root@node1 ch04]# hdfs dfs -ls /user/root/covtype
    Found 3 items
    -rw-r--r--   3 root supergroup   75169317 2015-12-06 09:02 /user/root/covtype/covtype.data
    -rw-r--r--   3 root supergroup      14610 2015-12-06 09:02 /user/root/covtype/covtype.info
    -rw-r--r--   3 root supergroup       4847 2015-12-06 09:02 /user/root/covtype/old_covtype.info

    Start spark-shell

    [root@node1 ch04]# ../spark-1.2.1-bin-hadoop2.4/bin/spark-shell --master yarn-client
    Welcome to
          ____              __
         / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.2.1
          /_/
    
    Using Scala version 2.10.4 (OpenJDK 64-Bit Server VM, Java 1.7.0_09-icedtea)
    Type in expressions to have them evaluated.
    Type :help for more information.
    15/12/06 09:08:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    Spark context available as sc.

    Run the code from Section 4.7:

    scala> import org.apache.spark.mllib.linalg._
    import org.apache.spark.mllib.linalg._
    
    scala> import org.apache.spark.mllib.regression._
    import org.apache.spark.mllib.regression._
    
    scala> 
    
    scala> val rawData = sc.textFile("hdfs:///user/root/covtype/covtype.data")
    rawData: org.apache.spark.rdd.RDD[String] = hdfs:///user/root/covtype/covtype.data MappedRDD[1] at textFile at <console>:18
    
    scala> val data = rawData.map { line =>
         |   val values = line.split(',').map(_.toDouble)
         |   val featureVector = Vectors.dense(values.init)
         |   val label = values.last - 1
         |   LabeledPoint(label, featureVector)
         | }
    data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MappedRDD[2] at map at <console>:20
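
    Each line of covtype.data holds 54 comma-separated feature values followed by the cover type. values.init takes the features, and values.last - 1 shifts the 1-based cover-type label to the 0-based labels that MLlib's DecisionTree expects.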
    
    scala> val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))
    trainData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = PartitionwiseSampledRDD[3] at randomSplit at <console>:22
    cvData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = PartitionwiseSampledRDD[4] at randomSplit at <console>:22
    testData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = PartitionwiseSampledRDD[5] at randomSplit at <console>:22
    
    scala> trainData.cache()
    res0: trainData.type = PartitionwiseSampledRDD[3] at randomSplit at <console>:22
    
    scala> cvData.cache()
    res1: cvData.type = PartitionwiseSampledRDD[4] at randomSplit at <console>:22
    
    scala> testData.cache()
    res2: testData.type = PartitionwiseSampledRDD[5] at randomSplit at <console>:22
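
    All three subsets are cached because the hyperparameter sweeps below scan them repeatedly; without cache(), every training run would re-read and re-parse the file from HDFS.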

    scala> import org.apache.spark.mllib.evaluation._
    import org.apache.spark.mllib.evaluation._

    
    

    scala> import org.apache.spark.mllib.tree._
    import org.apache.spark.mllib.tree._

    
    

    scala> import org.apache.spark.mllib.tree.model._
    import org.apache.spark.mllib.tree.model._

    
    

    scala> import org.apache.spark.rdd._
    import org.apache.spark.rdd._

    
    

    scala> def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]):
    | MulticlassMetrics = {
    | val predictionsAndLabels = data.map(example => (model.predict(example.features), example.label))
    | new MulticlassMetrics(predictionsAndLabels)
    | }
    getMetrics: (model: org.apache.spark.mllib.tree.model.DecisionTreeModel, data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint])org.apache.spark.mllib.evaluation.MulticlassMetrics

    
    

    scala> val model = DecisionTree.trainClassifier(trainData, 7, Map[Int, Int](), "gini", 4, 100)
    model: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 4 with 31 nodes

    
    

    scala> val metrics = getMetrics(model, cvData)
    metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@5d574c23

    
    

    scala> metrics. confusionMatrix
    res6: org.apache.spark.mllib.linalg.Matrix =
    15535.0  5345.0   21.0    0.0  0.0   0.0  392.0
    6669.0   20855.0  688.0   0.0  5.0   0.0  47.0
    0.0      610.0    2942.0  0.0  0.0   0.0  0.0
    0.0      0.0      274.0   0.0  0.0   0.0  0.0
    12.0     874.0    57.0    0.0  15.0  0.0  0.0
    0.0      446.0    1318.0  0.0  0.0   0.0  0.0
    1150.0   19.0     8.0     0.0  0.0   0.0  905.0
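
    Rows are actual classes and columns are predicted classes, both ordered by label, so the diagonal counts the correct predictions. Note that two columns are entirely zero: at depth 4 the tree never predicts classes 3 and 5 at all.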

    
    

    scala> metrics.precision
    res7: Double = 0.6917696392665028


    scala> (0 until 7).map(
    | cat => (metrics.precision(cat), metrics.recall(cat))
    | ).foreach(println)
    (0.6648549174013524,0.729582491898746)
    (0.7408788944545099,0.7378644211718086)
    (0.554257724189902,0.8282657657657657)
    (0.0,0.0)
    (0.75,0.015657620041753653)
    (0.0,0.0)
    (0.6733630952380952,0.4346781940441883)
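
    The per-class view shows what the single accuracy figure above hides: precision and recall are both 0.0 for classes 3 and 5, which the shallow tree never predicts, and recall for class 4 is under 2%.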

    scala> import org.apache.spark.rdd._
    import org.apache.spark.rdd._

    scala> def classProbabilities(data: RDD[LabeledPoint]): Array[Double] = {
    | val countsByCategory = data.map(_.label).countByValue()
    | val counts = countsByCategory.toArray.sortBy(_._1).map(_._2)
    | counts.map(_.toDouble / counts.sum)
    | }
    classProbabilities: (data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint])Array[Double]

    scala> val trainPriorProbabilities = classProbabilities(trainData)
    trainPriorProbabilities: Array[Double] = Array(0.3644680841907762, 0.48778063233452534, 0.06163475731247069, 0.004682046846288574, 0.0163893156379504, 0.029860958700732, 0.035184204977256786)

    scala> val cvPriorProbabilities = classProbabilities(cvData)
    cvPriorProbabilities: Array[Double] = Array(0.36594084589341264, 0.4857442384037672, 0.061044563218588345, 0.004708955608641105, 0.016464158660869265, 0.03031604997679894, 0.03578118823792256)

    scala> trainPriorProbabilities.zip(cvPriorProbabilities).map {
    | case (trainProb, cvProb) => trainProb * cvProb
    | }.sum
    res9: Double = 0.3765289404519721
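
    This sum is the accuracy a random guesser would achieve: a classifier that picks class c with the training-set prior probability is correct on a CV example with probability p_train(c) * p_cv(c), summed over all classes. Guessing gets about 37.7% right, so the depth-4 tree's 69% is a real improvement, though not yet a good result.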

    scala> val evaluations =
    | for (impurity <- Array("gini", "entropy");
    | depth <- Array(1, 20);
    | bins <- Array(10, 300))
    | yield {
    | val model = DecisionTree.trainClassifier(trainData, 7, Map[Int, Int](), impurity, depth, bins)
    | val predictionsAndLabels = cvData.map(example => (model.predict(example.features), example.label))
    | val accuracy = new MulticlassMetrics(predictionsAndLabels).precision
    | ((impurity, depth, bins), accuracy)
    | }
    evaluations: Array[((String, Int, Int), Double)] = Array(((gini,1,10),0.6319968377816351), ((gini,1,300),0.6323577431385017), ((gini,20,10),0.889253613350061), ((gini,20,300),0.9074191829790159), ((entropy,1,10),0.4857442384037672), ((entropy,1,300),0.4857442384037672), ((entropy,20,10),0.8946500077336862), ((entropy,20,300),0.9099455204770825))

    scala> evaluations.sortBy(_._2).reverse.foreach(println)
    ((entropy,20,300),0.9099455204770825)
    ((gini,20,300),0.9074191829790159)
    ((entropy,20,10),0.8946500077336862)
    ((gini,20,10),0.889253613350061)
    ((gini,1,300),0.6323577431385017)
    ((gini,1,10),0.6319968377816351)
    ((entropy,1,300),0.4857442384037672)
    ((entropy,1,10),0.4857442384037672)

    scala> val data = rawData.map { line =>
    | val values = line.split(',').map(_.toDouble)
    | val wilderness = values.slice(10, 14).indexOf(1.0).toDouble
    | val soil = values.slice(14, 54).indexOf(1.0).toDouble
    | val featureVector =
    | Vectors.dense(values.slice(0, 10) :+ wilderness :+ soil)
    | val label = values.last - 1
    | LabeledPoint(label, featureVector)
    | }
    data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MappedRDD[391] at map at <console>:47

    scala>
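
    Before the next sweep, the newly parsed data (with wilderness and soil as single categorical features) must be split again; otherwise trainData and cvData still hold the old 54-column one-hot vectors. A minimal re-split and cache, as the book does at this point:

    val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))
    trainData.cache()
    cvData.cache()
    testData.cache()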

    scala> val evaluations =
    | for (impurity <- Array("gini", "entropy");
    | depth <- Array(10, 20, 30);
    | bins <- Array(40, 300))
    | yield {
    | val model = DecisionTree.trainClassifier(trainData, 7, Map(10 -> 4, 11 -> 40), impurity, depth, bins)
    | val trainAccuracy = getMetrics(model, trainData).precision
    | val cvAccuracy = getMetrics(model, cvData).precision
    | ((impurity, depth, bins), (trainAccuracy, cvAccuracy))
    | }
    evaluations: Array[((String, Int, Int), (Double, Double))] = Array(((gini,10,40),(0.7772542032989496,0.7730420884389984)), ((gini,10,300),(0.7849615065174265,0.7793665251688522)), ((gini,20,40),(0.9393033733975393,0.904480382215959)), ((gini,20,300),(0.9421715574260792,0.904480382215959)), ((gini,30,40),(0.9972329447406585,0.9341089934177738)), ((gini,30,300),(0.9974352022790551,0.9347964321927579)), ((entropy,10,40),(0.7768755083334409,0.7716672108890302)), ((entropy,10,300),(0.7715307452975122,0.7655318198222971)), ((entropy,20,40),(0.9487578374796128,0.9103407977726984)), ((entropy,20,300),(0.9484781196073622,0.9088971763452317)), ((entropy,30,40),(0.998582045555283,0.9374430714764467)), ((entropy,30,300),(0.9990833860493938,0.9413786584632307)))
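
    Each result now pairs training accuracy with CV accuracy. At depth 30 the trees fit the training set almost perfectly (about 99.9%) while CV accuracy tops out near 94.1% for entropy with 300 bins; that gap is the overfitting that motivates trying a random forest next.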

    scala> val forest = RandomForest.trainClassifier(
    | trainData, 7, Map(10 -> 4, 11 -> 40), 20,
    | "auto", "entropy", 30, 300)
    forest: org.apache.spark.mllib.tree.model.RandomForestModel =
    TreeEnsembleModel classifier with 20 trees
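
    With the forest trained, the chapter closes by classifying a new example. A sketch following the book's final step; the comma-separated values are just an illustrative input of the 10 numeric features plus the two categorical codes:

    val input = "2709,125,28,67,23,3224,253,207,61,6094,0,29"
    val vector = Vectors.dense(input.split(',').map(_.toDouble))
    forest.predict(vector)  // returns the predicted cover type (0-based) as a Double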

     
  • Original post: https://www.cnblogs.com/littlesuccess/p/5023148.html