zoukankan      html  css  js  c++  java
  • Spark2 oneHot编码--标准化--主成分--聚类

    1.导入包

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.Dataset
    import org.apache.spark.sql.Row
    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.Column
    import org.apache.spark.sql.DataFrameReader
    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
    import org.apache.spark.sql.Encoder
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.DataFrameStatFunctions
    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.ml.feature.StringIndexer
    import org.apache.spark.ml.feature.OneHotEncoder
    import org.apache.spark.ml.feature.VectorAssembler
    import org.apache.spark.ml.feature.MinMaxScaler
    import org.apache.spark.ml.feature.StandardScaler
    import org.apache.spark.ml.feature.PCA
    import org.apache.spark.ml.clustering.KMeans
    

    2.导入数据

       
    val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate() 
       
    // For implicit conversions like converting RDDs to DataFrames 
    import spark.implicits._
       
    val data: DataFrame = spark.read.format("csv").option("header", true).load("hdfs://ns1/datafile/wangxiao/Affairs.csv") 
    data: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields] 
       
    data.cache 
    res0: data.type = [affairs: string, gender: string ... 7 more fields] 
      
    data.limit(10).show() 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
    |affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating| 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
    |      0|  male| 37|          10|      no|            3|       18|         7|     4| 
    |      0|female| 27|           4|      no|            4|       14|         6|     4| 
    |      0|female| 32|          15|     yes|            1|       12|         1|     4| 
    |      0|  male| 57|          15|     yes|            5|       18|         6|     5| 
    |      0|  male| 22|        0.75|      no|            2|       17|         6|     3| 
    |      0|female| 32|         1.5|      no|            2|       17|         5|     5| 
    |      0|female| 22|        0.75|      no|            2|       12|         1|     3| 
    |      0|  male| 57|          15|     yes|            2|       14|         4|     4| 
    |      0|female| 32|          15|     yes|            4|       16|         1|     2| 
    |      0|  male| 22|         1.5|      no|            4|       14|         4|     5| 
    +-------+------+---+------------+--------+-------------+---------+----------+------+ 
       
    // 转换字符类型,将Double和String的字段分开放 
    val data1 = data.select( 
         |   data("affairs").cast("Double"), 
         |   data("age").cast("Double"), 
         |   data("yearsmarried").cast("Double"), 
         |   data("religiousness").cast("Double"), 
         |   data("education").cast("Double"), 
         |   data("occupation").cast("Double"), 
         |   data("rating").cast("Double"), 
         |   data("gender").cast("String"), 
         |   data("children").cast("String")) 
    data1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields] 
       
    data1.printSchema() 
    root 
     |-- affairs: double (nullable = true) 
     |-- age: double (nullable = true) 
     |-- yearsmarried: double (nullable = true) 
     |-- religiousness: double (nullable = true) 
     |-- education: double (nullable = true) 
     |-- occupation: double (nullable = true) 
     |-- rating: double (nullable = true) 
     |-- gender: string (nullable = true) 
     |-- children: string (nullable = true) 
       
       
    data1.limit(10).show 
    +-------+----+------------+-------------+---------+----------+------+------+--------+ 
    |affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children| 
    +-------+----+------------+-------------+---------+----------+------+------+--------+ 
    |    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no| 
    |    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no| 
    |    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes| 
    |    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes| 
    |    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no| 
    |    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no| 
    |    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no| 
    |    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes| 
    |    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes| 
    |    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no| 
    +-------+----+------------+-------------+---------+----------+------+------+--------+ 
       
    val dataDF = data1
    dataDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields] 
       
    dataDF.cache() 
    res4: dataDF.type = [affairs: double, age: double ... 7 more fields] 
    

    3.字符转换成数字索引,OneHot编码,注意setDropLast设置为false

    字符转换成数字索引 
    val indexer = new StringIndexer().setInputCol("gender").setOutputCol("genderIndex").fit(dataDF) 
    indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_27dba613193a 
       
    val indexed = indexer.transform(dataDF) 
    indexed: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 8 more fields] 
       
    // OneHot编码,注意setDropLast设置为false 
    val encoder = new OneHotEncoder().setInputCol("genderIndex").setOutputCol("genderVec").setDropLast(false) 
    encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_155a53de3aef 
       
    val encoded = encoder.transform(indexed) 
    encoded: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 9 more fields] 
       
    encoded.show() 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+ 
    |affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec| 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+ 
    |    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])| 
    |    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])| 
    |    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])| 
    |    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])| 
    |    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])| 
    |    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])| 
    |    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])| 
    |    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])| 
    |    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])| 
    |    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     18.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])| 
    |    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])| 
    |    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])| 
    |    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])| 
    |    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])| 
    |    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])| 
    |    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])| 
    |    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])| 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+ 
    only showing top 20 rows 
      
    val indexer1 = new StringIndexer().setInputCol("children").setOutputCol("childrenIndex").fit(encoded) 
    indexer1: org.apache.spark.ml.feature.StringIndexerModel = strIdx_55db099c07b7
       
    val indexed1 = indexer1.transform(encoded) 
    indexed1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 10 more fields] 
       
    val encoder1 = new OneHotEncoder().setInputCol("childrenIndex").setOutputCol("childrenVec").setDropLast(false) 
       
    val encoded1 = encoder1.transform(indexed1) 
    encoded1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields] 
       
    encoded1.show() 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+ 
    |affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec|childrenIndex|  childrenVec| 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+ 
    |    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     18.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+ 
    only showing top 20 rows 
      
       
    val encodeDF: DataFrame = encoded1
    encodeDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields] 
       
    encodeDF.show() 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+ 
    |affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec|childrenIndex|  childrenVec| 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+ 
    |    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     18.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|27.0|         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    |    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])| 
    |    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])| 
    +-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+ 
    only showing top 20 rows 
       
       
    encodeDF.printSchema() 
    root 
     |-- affairs: double (nullable = true) 
     |-- age: double (nullable = true) 
     |-- yearsmarried: double (nullable = true) 
     |-- religiousness: double (nullable = true) 
     |-- education: double (nullable = true) 
     |-- occupation: double (nullable = true) 
     |-- rating: double (nullable = true) 
     |-- gender: string (nullable = true) 
     |-- children: string (nullable = true) 
     |-- genderIndex: double (nullable = true) 
     |-- genderVec: vector (nullable = true) 
     |-- childrenIndex: double (nullable = true) 
     |-- childrenVec: vector (nullable = true) 
    

    4.将字段组合成向量feature

    //将字段组合成向量feature 
    val assembler = new VectorAssembler().setInputCols(Array("affairs", "age", "yearsmarried", "religiousness", "education", "occupation", "rating", "genderVec", "childrenVec")).setOutputCol("features") 
    assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_df76d5d1e3f4
       
    val vecDF: DataFrame = assembler.transform(encodeDF) 
    vecDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 12 more fields] 
       
    vecDF.select("features").show 
    +--------------------+ 
    |            features| 
    +--------------------+ 
    |[0.0,37.0,10.0,3....| 
    |[0.0,27.0,4.0,4.0...| 
    |[0.0,32.0,15.0,1....| 
    |[0.0,57.0,15.0,5....| 
    |[0.0,22.0,0.75,2....| 
    |[0.0,32.0,1.5,2.0...| 
    |[0.0,22.0,0.75,2....| 
    |[0.0,57.0,15.0,2....| 
    |[0.0,32.0,15.0,4....| 
    |[0.0,22.0,1.5,4.0...| 
    |[0.0,37.0,15.0,2....| 
    |[0.0,27.0,4.0,4.0...| 
    |[0.0,47.0,15.0,5....| 
    |[0.0,22.0,1.5,2.0...| 
    |[0.0,27.0,4.0,4.0...| 
    |[0.0,37.0,15.0,1....| 
    |[0.0,37.0,15.0,2....| 
    |[0.0,22.0,0.75,3....| 
    |[0.0,22.0,1.5,2.0...| 
    |[0.0,27.0,10.0,2....| 
    +--------------------+ 
    only showing top 20 rows 
    

    5.标准化--均值标准差

    // 标准化--均值标准差 
    val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(true).setWithMean(true) 
    scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_43d3da1cd3bf 
       
    // Compute summary statistics by fitting the StandardScaler. 
    val scalerModel = scaler.fit(vecDF) 
    scalerModel: org.apache.spark.ml.feature.StandardScalerModel = stdScal_43d3da1cd3bf 
       
    // Normalize each feature to have unit standard deviation. 
    val scaledData: DataFrame = scalerModel.transform(vecDF) 
    scaledData: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 13 more fields] 
       
    scaledData.select("features", "scaledFeatures").show 
    +--------------------+--------------------+ 
    |            features|      scaledFeatures| 
    +--------------------+--------------------+ 
    |[0.0,37.0,10.0,3....|[-0.4413500298573...| 
    |[0.0,27.0,4.0,4.0...|[-0.4413500298573...| 
    |[0.0,32.0,15.0,1....|[-0.4413500298573...| 
    |[0.0,57.0,15.0,5....|[-0.4413500298573...| 
    |[0.0,22.0,0.75,2....|[-0.4413500298573...| 
    |[0.0,32.0,1.5,2.0...|[-0.4413500298573...| 
    |[0.0,22.0,0.75,2....|[-0.4413500298573...| 
    |[0.0,57.0,15.0,2....|[-0.4413500298573...| 
    |[0.0,32.0,15.0,4....|[-0.4413500298573...| 
    |[0.0,22.0,1.5,4.0...|[-0.4413500298573...| 
    |[0.0,37.0,15.0,2....|[-0.4413500298573...| 
    |[0.0,27.0,4.0,4.0...|[-0.4413500298573...| 
    |[0.0,47.0,15.0,5....|[-0.4413500298573...| 
    |[0.0,22.0,1.5,2.0...|[-0.4413500298573...| 
    |[0.0,27.0,4.0,4.0...|[-0.4413500298573...| 
    |[0.0,37.0,15.0,1....|[-0.4413500298573...| 
    |[0.0,37.0,15.0,2....|[-0.4413500298573...| 
    |[0.0,22.0,0.75,3....|[-0.4413500298573...| 
    |[0.0,22.0,1.5,2.0...|[-0.4413500298573...| 
    |[0.0,27.0,10.0,2....|[-0.4413500298573...| 
    +--------------------+--------------------+ 
    only showing top 20 rows 
    

    6.主成分PCA

    // 主成分 
    val pca = new PCA().setInputCol("scaledFeatures").setOutputCol("pcaFeatures").setK(3).fit(scaledData) 
       
    pca.explainedVariance.values //解释变量方差 
    res11: Array[Double] = Array(0.28779526464781313, 0.23798543640278289, 0.11742828783633019) 
       
    pca.pc //载荷(观测变量与主成分的相关系数) 
    res12: org.apache.spark.ml.linalg.DenseMatrix =
    -0.12034310848156521  0.05153952289637974   0.6678769450480689
    -0.42860623714516627  0.05417889891307473   -0.05592377098140197
    -0.44404074412877986  0.1926596811059294    -0.017025575192258197
    -0.12233707317255231  0.08053139375662526   -0.5093149296300096
    -0.14664751606128462  -0.3872166556211308   -0.03406819489501708
    -0.145543746024348    -0.43054860653839705  0.07841454709046872
    0.17703994181974803   -0.12792784984216296  -0.5173229755329072
    0.2459668445061567    0.4915809641798787    0.010477548320795945
    -0.2459668445061567   -0.4915809641798787   -0.010477548320795945
    -0.44420980045271047  0.240652448514566     -0.089356723885704
    0.4442098004527103    -0.24065244851456588  0.08935672388570405
       
    pca.extractParamMap() 
    res13: org.apache.spark.ml.param.ParamMap =
    { 
        pca_40a453a54776-inputCol: scaledFeatures, 
        pca_40a453a54776-k: 3, 
        pca_40a453a54776-outputCol: pcaFeatures 
    } 
       
    pca.params 
    res14: Array[org.apache.spark.ml.param.Param[_]] = Array(pca_40a453a54776__inputCol, pca_40a453a54776__k, pca_40a453a54776__outputCol) 
       
      
       
    val pcaDF: DataFrame = pca.transform(scaledData) 
    pcaDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 14 more fields] 
       
    pcaDF.cache() 
    res15: pcaDF.type = [affairs: double, age: double ... 14 more fields] 
       
       
    pcaDF.printSchema() 
    root 
     |-- affairs: double (nullable = true) 
     |-- age: double (nullable = true) 
     |-- yearsmarried: double (nullable = true) 
     |-- religiousness: double (nullable = true) 
     |-- education: double (nullable = true) 
     |-- occupation: double (nullable = true) 
     |-- rating: double (nullable = true) 
     |-- gender: string (nullable = true) 
     |-- children: string (nullable = true) 
     |-- genderIndex: double (nullable = true) 
     |-- genderVec: vector (nullable = true) 
     |-- childrenIndex: double (nullable = true) 
     |-- childrenVec: vector (nullable = true) 
     |-- features: vector (nullable = true) 
     |-- scaledFeatures: vector (nullable = true) 
     |-- pcaFeatures: vector (nullable = true) 
       
       
    pcaDF.select("features", "scaledFeatures", "pcaFeatures").show 
    +--------------------+--------------------+--------------------+ 
    |            features|      scaledFeatures|         pcaFeatures| 
    +--------------------+--------------------+--------------------+ 
    |[0.0,37.0,10.0,3....|[-0.4413500298573...|[0.27828160409293...| 
    |[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[2.42147114101165...| 
    |[0.0,32.0,15.0,1....|[-0.4413500298573...|[0.18301418047489...| 
    |[0.0,57.0,15.0,5....|[-0.4413500298573...|[-2.9795960667914...| 
    |[0.0,22.0,0.75,2....|[-0.4413500298573...|[1.79299133565688...| 
    |[0.0,32.0,1.5,2.0...|[-0.4413500298573...|[2.65694237441759...| 
    |[0.0,22.0,0.75,2....|[-0.4413500298573...|[3.48234503794570...| 
    |[0.0,57.0,15.0,2....|[-0.4413500298573...|[-2.4215838062079...| 
    |[0.0,32.0,15.0,4....|[-0.4413500298573...|[-0.6964555195741...| 
    |[0.0,22.0,1.5,4.0...|[-0.4413500298573...|[2.18771069800414...| 
    |[0.0,37.0,15.0,2....|[-0.4413500298573...|[-2.4259075891377...| 
    |[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[-0.7743038356008...| 
    |[0.0,47.0,15.0,5....|[-0.4413500298573...|[-2.6176149267534...| 
    |[0.0,22.0,1.5,2.0...|[-0.4413500298573...|[2.95788535193022...| 
    |[0.0,27.0,4.0,4.0...|[-0.4413500298573...|[2.50146472861263...| 
    |[0.0,37.0,15.0,1....|[-0.4413500298573...|[-0.5123817022008...| 
    |[0.0,37.0,15.0,2....|[-0.4413500298573...|[-0.9191740114044...| 
    |[0.0,22.0,0.75,3....|[-0.4413500298573...|[2.97391491782863...| 
    |[0.0,22.0,1.5,2.0...|[-0.4413500298573...|[3.17940505267806...| 
    |[0.0,27.0,10.0,2....|[-0.4413500298573...|[0.74585406839527...| 
    +--------------------+--------------------+--------------------+ 
    only showing top 20 rows 
    

    7.聚类

    // 注意最大迭代次数;本例用 SSE(肘部法)来选择 K,并未计算轮廓系数(如需轮廓系数,Spark 2.3+ 可用 ClusteringEvaluator)
       
    // Sweep K from 2 to 20 and record the Within-Set Sum of Squared Errors (SSE)
    // for each K, so the elbow method can be used to choose the cluster count.
    val KSSE = (2 to 20 by 1).toList.map { k =>
          // Train a k-means model on the standardized feature vectors
          // (fixed seed for reproducibility).
          val kmeans = new KMeans().setK(k).setSeed(1L).setFeaturesCol("scaledFeatures")
          val model = kmeans.fit(scaledData)
    
          // Evaluate clustering by computing Within Set Sum of Squared Errors.
          // NOTE(review): KMeansModel.computeCost is deprecated since Spark 2.4 —
          // prefer ClusteringEvaluator (silhouette) on newer versions.
          val WSSSE = model.computeCost(scaledData)
    
          // Tuple per K: (K, max-iteration *setting*, SSE, per-row cluster assignments,
          // per-cluster record counts, cluster centers).
          // NOTE: getMaxIter returns the configured maxIter param, NOT the number of
          // iterations the solver actually performed.
          (k, model.getMaxIter, WSSSE, model.summary.cluster, model.summary.clusterSizes, model.clusterCenters)
        }
    
        // Choose K from the SSE curve (elbow); keep only DataFrame-friendly columns
        // (drop the per-row assignment DataFrame and the center vectors).
        val KSSEdf:DataFrame=KSSE.map{x=>(x._1,x._2,x._3,x._5)}.toDF("K", "MaxIter", "SSE", "clusterSizes")
   
    KSSE.foreach(println) 
    
  • 相关阅读:
    Python异常详解:基类、具体异常、异常层次结构
    Python视频教程,百度云资源,免费分享
    Python学习路线图(内附14张思维导图)
    Python视频教程免费下载,最新Python免费教程视频分享!
    怎样通过互联网ssh访问家里电脑
    linux下,把屏幕竖起来
    python中函数的不定长参数
    python中全局变量和局部变量
    vbox+Vagrant 入门指南
    python中函数返回多个值
  • 原文地址:https://www.cnblogs.com/wwxbi/p/6028175.html
Copyright © 2011-2022 走看看