  • Spark 2: Exploratory Statistical Data Analysis

    For the `data` source used below, see my earlier post: http://www.cnblogs.com/wwxbi/p/6063613.html

    import org.apache.spark.sql.DataFrameStatFunctions

    import org.apache.spark.sql.functions._
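
    The examples below assume a DataFrame named data. A minimal sketch of loading it
    (the file path is a placeholder and the CSV options are assumptions; the column
    names come from the describe() output further down):

    // Hypothetical load of the source data; adjust the path to your copy
    val data = spark.read
      .option("header", "true")      // assume the file carries a header row
      .option("inferSchema", "true") // let Spark infer numeric column types
      .csv("/path/to/data.csv")
      .select("age", "yearsmarried", "religiousness", "education", "occupation", "rating")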

    Correlation coefficient

    val df = Range(0,10,step=1).toDF("id").withColumn("rand1", rand(seed=10)).withColumn("rand2", rand(seed=27))
    df: org.apache.spark.sql.DataFrame = [id: int, rand1: double ... 1 more field]
    
    df.show
    +---+-------------------+-------------------+
    | id|              rand1|              rand2|
    +---+-------------------+-------------------+
    |  0|0.41371264720975787|  0.714105256846827|
    |  1| 0.7311719281896606| 0.8143487574232506|
    |  2| 0.9031701155118229| 0.5282207324381174|
    |  3|0.09430205113458567| 0.4420100497826609|
    |  4|0.38340505276222947| 0.9387162206758006|
    |  5| 0.5569246135523511| 0.6398126862647711|
    |  6| 0.4977441406613893| 0.9895498513115722|
    |  7| 0.2076666106201438| 0.3398720242725498|
    |  8| 0.9571919406508957|0.15042237695815963|
    |  9| 0.7429395461204413| 0.7302723457066639|
    +---+-------------------+-------------------+
    
    
    df.stat.corr("rand1", "rand2", "pearson")
    res24: Double = -0.10993962467082698
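
    Note that DataFrameStatFunctions.corr in Spark 2 supports only the Pearson method,
    so passing "pearson" is equivalent to the two-argument form. The same class also
    exposes the sample covariance of two columns; a quick sketch on the same frame:

    // Pearson correlation (the default) and sample covariance, both Doubles
    df.stat.corr("rand1", "rand2")  // same result as above
    df.stat.cov("rand1", "rand2")   // sample covariance of the two columns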
    

    Viewing the statistical distribution of the data

    val colArray = Array("age", "yearsmarried", "religiousness", "education", "occupation", "rating")
    
    // View the statistical distribution of the data
    val descrDF = data.describe("age", "yearsmarried", "religiousness", "education", "occupation", "rating")
    descrDF: org.apache.spark.sql.DataFrame = [summary: string, age: string ... 5 more fields]
    
    descrDF.selectExpr("summary",
            "round(age,2) as age",
            "round(yearsmarried,2) as yearsmarried",
            "round(religiousness,2) as religiousness",
            "round(education,2) as education",
            "round(occupation,2) as occupation",
            "round(rating,2) as rating").show(10, truncate = false)
    +-------+-----+------------+-------------+---------+----------+------+
    |summary|age  |yearsmarried|religiousness|education|occupation|rating|
    +-------+-----+------------+-------------+---------+----------+------+
    |count  |601.0|601.0       |601.0        |601.0    |601.0     |601.0 |
    |mean   |32.49|8.18        |3.12         |16.17    |4.19      |3.93  |
    |stddev |9.29 |5.57        |1.17         |2.4      |1.82      |1.1   |
    |min    |17.5 |0.13        |1.0          |9.0      |1.0       |1.0   |
    |max    |57.0 |15.0        |5.0          |20.0     |7.0       |5.0   |
    +-------+-----+------------+-------------+---------+----------+------+
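
    describe() does not report medians. Approximate quantiles are available through
    approxQuantile (part of DataFrameStatFunctions since Spark 2.0); a sketch on the
    same data:

    // Approximate quartiles of age; the last argument is the relative error
    // (smaller is more accurate but more expensive, 0.0 means exact)
    val quartiles = data.stat.approxQuantile("age", Array(0.25, 0.5, 0.75), 0.01)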
    

    Counting the number of frequent items in each column

    // Find the frequent items in each column, then count them below
    val fi = data.stat.freqItems(colArray)
    fi: org.apache.spark.sql.DataFrame = [age_freqItems: array<double>, yearsmarried_freqItems: array<double> ... 4 more fields]
    
    fi.printSchema()
    root
     |-- age_freqItems: array (nullable = true)
     |    |-- element: double (containsNull = false)
     |-- yearsmarried_freqItems: array (nullable = true)
     |    |-- element: double (containsNull = false)
     |-- religiousness_freqItems: array (nullable = true)
     |    |-- element: double (containsNull = false)
     |-- education_freqItems: array (nullable = true)
     |    |-- element: double (containsNull = false)
     |-- occupation_freqItems: array (nullable = true)
     |    |-- element: double (containsNull = false)
     |-- rating_freqItems: array (nullable = true)
     |    |-- element: double (containsNull = false)
    
    
    val f = fi.selectExpr(
        "size(age_freqItems)",
        "size(yearsmarried_freqItems)",
        "size(religiousness_freqItems)",
        "size(education_freqItems)",
        "size(occupation_freqItems)",
        "size(rating_freqItems)")
    f: org.apache.spark.sql.DataFrame = [size(age_freqItems): int, size(yearsmarried_freqItems): int ... 4 more fields]
    
    f.show(10, truncate = false)
    +-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+
    |size(age_freqItems)|size(yearsmarried_freqItems)|size(religiousness_freqItems)|size(education_freqItems)|size(occupation_freqItems)|size(rating_freqItems)|
    +-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+
    |9                  |8                           |5                            |7                        |7                         |5                     |
    +-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+
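
    freqItems uses an approximate algorithm with a default minimum support of 1%, so
    the arrays above may contain false positives. An overload takes an explicit
    support threshold; a sketch that keeps only items appearing in at least 30% of rows:

    // Frequent items with an explicit support threshold (second parameter);
    // a higher support returns fewer, more common items
    val fiCommon = data.stat.freqItems(colArray, 0.3)
    fiCommon.show(truncate = false)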
    

    Frequent items for a subset of columns

    // Frequent items for a subset of columns
    val f1 = data.stat.freqItems(Array("age", "yearsmarried", "religiousness"))
    f1: org.apache.spark.sql.DataFrame = [age_freqItems: array<double>, yearsmarried_freqItems: array<double> ... 1 more field]
    
    f1.show(10, truncate = false)
    +------------------------------------------------------+-----------------------------------------------+-------------------------+
    |age_freqItems                                         |yearsmarried_freqItems                         |religiousness_freqItems  |
    +------------------------------------------------------+-----------------------------------------------+-------------------------+
    |[32.0, 47.0, 22.0, 52.0, 37.0, 17.5, 27.0, 57.0, 42.0]|[0.75, 0.125, 1.5, 0.417, 4.0, 7.0, 10.0, 15.0]|[2.0, 5.0, 4.0, 1.0, 3.0]|
    +------------------------------------------------------+-----------------------------------------------+-------------------------+
    
    
    // Sort the elements of each array
    
    f1.selectExpr("sort_array(age_freqItems)", "sort_array(yearsmarried_freqItems)", "sort_array(religiousness_freqItems)").show(10, truncate = false)
    +------------------------------------------------------+-----------------------------------------------+-----------------------------------------+
    |sort_array(age_freqItems, true)                       |sort_array(yearsmarried_freqItems, true)       |sort_array(religiousness_freqItems, true)|
    +------------------------------------------------------+-----------------------------------------------+-----------------------------------------+
    |[17.5, 22.0, 27.0, 32.0, 37.0, 42.0, 47.0, 52.0, 57.0]|[0.125, 0.417, 0.75, 1.5, 4.0, 7.0, 10.0, 15.0]|[1.0, 2.0, 3.0, 4.0, 5.0]                |
    +------------------------------------------------------+-----------------------------------------------+-----------------------------------------+
    
    
    
    
    // Frequent items for the remaining columns
    val f2 = data.stat.freqItems(Array("education", "occupation", "rating"))
    f2: org.apache.spark.sql.DataFrame = [education_freqItems: array<double>, occupation_freqItems: array<double> ... 1 more field]
    
    f2.show(10, truncate = false)
    +-----------------------------------------+-----------------------------------+-------------------------+
    |education_freqItems                      |occupation_freqItems               |rating_freqItems         |
    +-----------------------------------------+-----------------------------------+-------------------------+
    |[17.0, 20.0, 14.0, 16.0, 9.0, 18.0, 12.0]|[2.0, 5.0, 4.0, 7.0, 1.0, 3.0, 6.0]|[2.0, 5.0, 4.0, 1.0, 3.0]|
    +-----------------------------------------+-----------------------------------+-------------------------+
    
    
    // Sort the elements of each array
    f2.selectExpr("sort_array(education_freqItems)", "sort_array(occupation_freqItems)", "sort_array(rating_freqItems)").show(10, truncate = false)
    +-----------------------------------------+--------------------------------------+----------------------------------+
    |sort_array(education_freqItems, true)    |sort_array(occupation_freqItems, true)|sort_array(rating_freqItems, true)|
    +-----------------------------------------+--------------------------------------+----------------------------------+
    |[9.0, 12.0, 14.0, 16.0, 17.0, 18.0, 20.0]|[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]   |[1.0, 2.0, 3.0, 4.0, 5.0]         |
    +-----------------------------------------+--------------------------------------+----------------------------------+
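
    Another exploratory tool in DataFrameStatFunctions is crosstab, which builds a
    pairwise frequency (contingency) table of two columns; a sketch using two of the
    columns above:

    // Contingency table of rating vs. religiousness: the first column holds the
    // distinct rating values, the remaining columns count co-occurrences
    val ct = data.stat.crosstab("rating", "religiousness")
    ct.show(truncate = false)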
    