zoukankan      html  css  js  c++  java
  • 2020.02.15

    1.数据导入

    从文件中导入数据,并转化为 DataFrame。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    import org.apache.spark.ml.feature.PCA
    import org.apache.spark.sql.Row
    import org.apache.spark.ml.linalg.{Vector,Vectors}
    import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
    import org.apache.spark.ml.{Pipeline,PipelineModel}
    import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer,HashingTF, Tokenizer}
    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.classification.LogisticRegressionModel
    import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}
    import org.apache.spark.sql.functions;
    scala> import spark.implicits._
    import spark.implicits._
    // One census record: six numeric columns packed into a feature vector, plus the raw income label string.
    scala> case class Adult(features: org.apache.spark.ml.linalg.Vector, label: String)
    defined class Adult
    // Each line is split on ","; columns 0, 2, 4, 10, 11 and 12 are parsed as doubles and column 14 is kept as the label.
    // NOTE(review): a blank or header line yields fewer than 15 fields and would make p(14) throw — confirm the input files are pre-cleaned.
    scala> val df = sc.textFile("adult.data.txt").map(_.split(",")).map(p => Adult(Vectors.dense(p(0).toDouble, p(2).toDouble, p(4).toDouble, p(10).toDouble, p(11).toDouble, p(12).toDouble), p(14))).toDF()
    df: org.apache.spark.sql.DataFrame = [features: vector, label: string]
    // The test file is parsed with exactly the same column selection so both DataFrames share one schema.
    scala> val test = sc.textFile("adult.test.txt").map(_.split(",")).map(p => Adult(Vectors.dense(p(0).toDouble, p(2).toDouble, p(4).toDouble, p(10).toDouble, p(11).toDouble, p(12).toDouble), p(14))).toDF()
    test: org.apache.spark.sql.DataFrame = [features: vector, label: string]

    2.进行主成分分析(PCA)

     对 6 个连续型的数值型变量进行主成分分析。PCA(主成分分析)是通过正交变换把一组相关变量的观测值转化成一组线性无关的变量值,即主成分的一种方法。PCA 通过使用主成分把特征向量投影到低维空间,实现对特征向量的降维。请通过 setK()方法将主成分数量设置为 3,把连续型的特征向量转化成一个 3 维的主成分。

    1
    2
    3
    4
    5
    // Configure a PCA stage that reads "features" and writes a 3-component projection to "pcaFeatures".
    scala> val pcaEstimator = new PCA().setK(3).setInputCol("features").setOutputCol("pcaFeatures")
    // Fit the projection on the training DataFrame; `pca` is the fitted model reused below.
    scala> val pca = pcaEstimator.fit(df)
    // Apply the same fitted projection to the training and the test data.
    scala> val result = pca.transform(df)
    scala> val testdata = pca.transform(test)
    // Display both transformed DataFrames with column truncation turned off.
    scala> result.show(false)
    scala> testdata.show(false)

    3.训练分类模型并预测居民收入

    在主成分分析的基础上,采用逻辑斯蒂回归,或者决策树模型预测居民收入是否超过 50K;对 Test 数据集进行验证。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    // Index the string labels, fitted on the PCA-transformed training data, and print the label order.
    scala> val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(result)
    scala> labelIndexer.labels.foreach(println)
    // Index the PCA output column and print how many features it carries.
    scala> val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures").fit(result)
    scala> println(featureIndexer.numFeatures)
    // Converts numeric predictions back to the original label strings using the indexer's label order.
    scala> val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
    scala> val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)
    scala> val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))
    scala> val lrPipelineModel = lrPipeline.fit(result)
    // Stage index 2 is the logistic regression stage in the Array passed to setStages above.
    scala> val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]
    scala> println("Coefficients: " + lrModel.coefficientMatrix+"Intercept: "+lrModel.interceptVector+"numClasses: "+lrModel.numClasses+"numFeatures: "+lrModel.numFeatures)
    scala> val lrPredictions = lrPipelineModel.transform(testdata)
    // The evaluator's default metric is F1, so accuracy is requested explicitly; otherwise "1.0 - lrAccuracy" would not be an error rate.
    scala> val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
    scala> val lrAccuracy = evaluator.evaluate(lrPredictions)
    scala> println("Test Error = " + (1.0 - lrAccuracy))

    4.超参数调优

    利用 CrossValidator 确定最优的参数,包括最优主成分 PCA 的维数、分类器自身的参数等。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    // ParamGridBuilder, CrossValidator and PCAModel are used in this section but are not among the imports at the top, so bring them into scope here.
    scala> import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
    scala> import org.apache.spark.ml.feature.PCAModel
    // PCA is left unfitted and without k here: it becomes a pipeline stage whose k is chosen by the grid search.
    scala> val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures")
    scala> val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
    scala> val featureIndexer = new VectorIndexer().setInputCol("pcaFeatures").setOutputCol("indexedFeatures")
    scala> val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
    scala> val lr = new LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(100)
    scala> val lrPipeline = new Pipeline().setStages(Array(pca, labelIndexer, featureIndexer, lr, labelConverter))
    // Grid: 6 values of pca.k x 2 values of elasticNetParam x 3 values of regParam = 36 candidate settings.
    scala> val paramGrid = new ParamGridBuilder().addGrid(pca.k, Array(1,2,3,4,5,6)).addGrid(lr.elasticNetParam, Array(0.2,0.8)).addGrid(lr.regParam, Array(0.01, 0.1, 0.5)).build()
    // 3-fold cross-validation over the whole pipeline; the selection evaluator keeps its default metric.
    scala> val cv = new CrossValidator().setEstimator(lrPipeline).setEvaluator(new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")).setEstimatorParamMaps(paramGrid).setNumFolds(3)
    scala> val cvModel = cv.fit(df)
    scala> val lrPredictions=cvModel.transform(test)
    // The printed message reports accuracy, so the metric is set explicitly instead of relying on the evaluator's F1 default.
    scala> val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
    scala> val lrAccuracy = evaluator.evaluate(lrPredictions)
    scala> println("准确率为"+lrAccuracy)
    scala> val bestModel= cvModel.bestModel.asInstanceOf[PipelineModel]
    // Stage indices follow the Array passed to setStages above: 0 = pca, 3 = lr.
    scala> val lrModel = bestModel.stages(3).asInstanceOf[LogisticRegressionModel]
    scala> println("Coefficients: " + lrModel.coefficientMatrix + "Intercept: "+lrModel.interceptVector+ "numClasses: "+lrModel.numClasses+"numFeatures: "+lrModel.numFeatures)
    scala> val pcaModel = bestModel.stages(0).asInstanceOf[PCAModel]
    scala> println("Primary Component: " + pcaModel.pc)
  • 相关阅读:
    MarkDown测试
    在Tabbed Activity(ViewPager)中切换Fragment
    About ListView
    Android Studio的技巧
    卷积神经网络
    TensorFlow中CNN的两种padding方式“SAME”和“VALID”
    tensorflow-解决3个问题
    激活函数
    tensorflow数学运算
    tensorflow
  • 原文地址:https://www.cnblogs.com/zql98/p/12311287.html
Copyright © 2011-2022 走看看