  • Machine Learning with Spark Reading Notes - CH05

    5.2. Extracting Suitable Features from the Data
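    This section uses the Kaggle StumbleUpon evergreen classification dataset (train.tsv). The shell commands below strip the header row with sed and upload the headerless file to HDFS before starting spark-shell.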

    [root@demo1 ch05]# sed 1d train.tsv > train_noheader.tsv
    [root@demo1 ch05]# ll
    total 42920
    -rw-r--r-- 1 root root 21972457 Jan 31 15:03 train_noheader.tsv
    -rw-r--r-- 1 root root 21972916 Jan 31 15:00 train.tsv
    [root@demo1 ch05]# hdfs dfs -mkdir /user/root/studio/MachineLearningWithSpark/ch05
    [root@demo1 ch05]# hdfs dfs -put train_noheader.tsv /user/root/studio/MachineLearningWithSpark/ch05

    [root@demo1 ch05]# spark-shell --master yarn

    scala> val rawData = sc.textFile("/user/root/studio/MachineLearningWithSpark/ch05/train_noheader.tsv")
    rawData: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[1] at textFile at <console>:27

    scala> val records = rawData.map(line => line.split("\t"))
    records: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[2] at map at <console>:29

    scala> records.first()
    res1: Array[String] = Array("http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html", "4042", "{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees ...
    scala> import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.regression.LabeledPoint

    scala> import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.linalg.Vectors

    scala> val data = records.map{ r =>
    | val trimmed = r.map(_.replaceAll("\"", ""))
    | val label = trimmed(r.size - 1).toInt
    | val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
    | LabeledPoint(label, Vectors.dense(features))
    | }
    data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[3] at map at <console>:33
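
    The data RDD is reused by every training call and evaluation pass below, so it is worth caching it in memory. The following line is not part of the original session but mirrors the book's approach:

    // Cache the parsed training data so the repeated passes below read it from memory.
    data.cache()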

    5.3. Training Classification Models

    scala> import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
    import org.apache.spark.mllib.classification.LogisticRegressionWithSGD

    scala> import org.apache.spark.mllib.classification.SVMWithSGD
    import org.apache.spark.mllib.classification.SVMWithSGD

    scala> import org.apache.spark.mllib.classification.NaiveBayes
    import org.apache.spark.mllib.classification.NaiveBayes

    scala> import org.apache.spark.mllib.tree.DecisionTree
    import org.apache.spark.mllib.tree.DecisionTree

    scala> import org.apache.spark.mllib.tree.configuration.Algo
    import org.apache.spark.mllib.tree.configuration.Algo

    scala> import org.apache.spark.mllib.tree.impurity.Entropy
    import org.apache.spark.mllib.tree.impurity.Entropy

    scala> val numIterations = 10
    numIterations: Int = 10

    scala> val maxTreeDepth = 5
    maxTreeDepth: Int = 5

    scala> val lrModel = LogisticRegressionWithSGD.train(data, numIterations)
    lrModel: org.apache.spark.mllib.classification.LogisticRegressionModel = org.apache.spark.mllib.classification.LogisticRegressionModel: intercept = 0.0, numFeatures = 22, numClasses = 2, threshold = 0.5

    scala> val svmModel = SVMWithSGD.train(data, numIterations)
    svmModel: org.apache.spark.mllib.classification.SVMModel = org.apache.spark.mllib.classification.SVMModel: intercept = 0.0, numFeatures = 22, numClasses = 2, threshold = 0.0
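
    The nbData RDD used in the next step is never defined in this transcript. Following the book, it is built the same way as data, but with negative feature values clamped to zero, because MLlib's naive Bayes does not accept negative features. A sketch (assumed, not copied from the original session):

    val nbData = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      // Naive Bayes requires non-negative features, so map "?" and negative values to 0.0.
      val features = trimmed.slice(4, r.size - 1)
        .map(d => if (d == "?") 0.0 else d.toDouble)
        .map(d => if (d < 0) 0.0 else d)
      LabeledPoint(label, Vectors.dense(features))
    }
    nbData.cache()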

    scala> val nbModel = NaiveBayes.train(nbData)
    nbModel: org.apache.spark.mllib.classification.NaiveBayesModel = org.apache.spark.mllib.classification.NaiveBayesModel@42cf75c1

    scala> val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)
    dtModel: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 5 with 61 nodes

    5.4. Using the Classification Models

    scala> val dataPoint = data.first
    dataPoint: org.apache.spark.mllib.regression.LabeledPoint = (0.0,[0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])

    scala> val prediction = lrModel.predict(dataPoint.features)
    prediction: Double = 1.0

    scala> val trueLabel = dataPoint.label
    trueLabel: Double = 0.0
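
    For this first data point the model is wrong: it predicts 1.0 while the true label is 0.0.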

    scala> val predictions = lrModel.predict(data.map(lp => lp.features))
    predictions: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[99] at mapPartitions at GeneralizedLinearAlgorithm.scala:69

    scala> predictions.take(5)
    res3: Array[Double] = Array(1.0, 1.0, 1.0, 1.0, 1.0)
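
    All five predictions above are 1.0, which hints that the logistic regression model is predicting the positive class for nearly every example. A quick check (not in the original session) over the full RDD:

    // Count how many of the model's predictions are the positive class.
    val numPositive = predictions.filter(_ == 1.0).count
    println(s"predicted 1.0 for $numPositive of ${predictions.count} examples")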

    5.5. Evaluating Classification Model Performance

    scala> val lrTotalCorrect = data.map { point =>
    | if (lrModel.predict(point.features) == point.label) 1 else 0
    | }.sum
    lrTotalCorrect: Double = 3806.0

    scala> val lrAccuracy = lrTotalCorrect / data.count
    lrAccuracy: Double = 0.5146720757268425

    scala> val svmTotalCorrect = data.map { point =>
    | if (svmModel.predict(point.features) == point.label) 1 else 0
    | }.sum
    svmTotalCorrect: Double = 3806.0

    scala> val svmAccuracy = svmTotalCorrect / data.count
    svmAccuracy: Double = 0.5146720757268425

    scala> val nbTotalCorrect = nbData.map { point =>
    | if (nbModel.predict(point.features) == point.label) 1 else 0
    | }.sum
    nbTotalCorrect: Double = 4292.0

    scala> val nbAccuracy = nbTotalCorrect / data.count
    nbAccuracy: Double = 0.5803921568627451

    scala> val dtTotalCorrect = data.map { point =>
    | val score = dtModel.predict(point.features)
    | val predicted = if (score > 0.5) 1 else 0
    | if (predicted == point.label) 1 else 0
    | }.sum
    dtTotalCorrect: Double = 4794.0

    scala> val dtAccuracy = dtTotalCorrect / data.count
    dtAccuracy: Double = 0.6482758620689655
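
    All of these accuracies are computed on the training data itself. The logistic regression and SVM models appear to predict 1.0 for almost every point, so their roughly 51.5% accuracy simply reflects the share of positive labels in the dataset; only the decision tree, at about 64.8%, does meaningfully better.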

    scala> import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

    scala> val metrics = Seq(lrModel, svmModel).map { model =>
    | val scoreAndLabels = data.map { point => (model.predict(point.features), point.label) }
    | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    | (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    | }
    metrics: Seq[(String, Double, Double)] = List((LogisticRegressionModel,0.7567586293858841,0.5014181143280931), (SVMModel,0.7567586293858841,0.5014181143280931))

    scala> val nbMetrics = Seq(nbModel).map { model =>
    | val scoreAndLabels = nbData.map { point =>
    | val score = model.predict(point.features)
    | (if (score > 0.5) 1.0 else 0.0, point.label)
    | }
    | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    | (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    | }
    nbMetrics: Seq[(String, Double, Double)] = List((NaiveBayesModel,0.6808510815151734,0.5835585110136261))

    scala> val dtMetrics = Seq(dtModel).map { model =>
    | val scoreAndLabels = data.map { point =>
    | val score = model.predict(point.features)
    | (if (score > 0.5) 1.0 else 0.0, point.label)
    | }
    | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    | (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    | }
    dtMetrics: Seq[(String, Double, Double)] = List((DecisionTreeModel,0.7430805993331199,0.6488371887050935))

    scala> val allMetrics = metrics ++ nbMetrics ++ dtMetrics
    allMetrics: Seq[(String, Double, Double)] = List((LogisticRegressionModel,0.7567586293858841,0.5014181143280931), (SVMModel,0.7567586293858841,0.5014181143280931), (NaiveBayesModel,0.6808510815151734,0.5835585110136261), (DecisionTreeModel,0.7430805993331199,0.6488371887050935))

    scala> allMetrics.foreach { case (m, pr, roc) =>
    | println(f"$m, Area under PR: ${pr * 100.0}%2.4f%%, Area under ROC: ${roc * 100.0}%2.4f%%")
    | }
    LogisticRegressionModel, Area under PR: 75.6759%, Area under ROC: 50.1418%
    SVMModel, Area under PR: 75.6759%, Area under ROC: 50.1418%
    NaiveBayesModel, Area under PR: 68.0851%, Area under ROC: 58.3559%
    DecisionTreeModel, Area under PR: 74.3081%, Area under ROC: 64.8837%
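
    An area under ROC of roughly 0.5 for the logistic regression and SVM models means they perform no better than random guessing here, while the decision tree achieves the best result of the four at about 0.65.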

  • Original post: https://www.cnblogs.com/littlesuccess/p/5173598.html