zoukankan      html  css  js  c++  java
  • spark 练习

    scala> import org.apache.spark.SparkContext

    import org.apache.spark.SparkContext

    scala> import org.apache.spark.SparkConf

    import org.apache.spark.SparkConf

    scala> import org.apache.spark.sql.SQLContext

    import org.apache.spark.sql.SQLContext

    scala> import spark.implicits._

    import spark.implicits._

    scala> val mysqlcon=new SQLContext(sc)
    warning: there was one deprecation warning; re-run with -deprecation for details
    mysqlcon: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@3ac76ad9

    scala> val mysqldf=mysqlcon.read.format("jdbc").options(Map("url"->"jdbc:mysql://localhost:3306/test","user"->"root","password"->"root","dbtable"->"Account_3004")).load()
    mysqldf: org.apache.spark.sql.DataFrame = [AccountName: string, Accid: bigint ... 30 more fields]

    scala> mysqldf.printSchema
    root
     |-- AccountName: string (nullable = false)
     |-- Accid: long (nullable = false)
     |-- platid: integer (nullable = false)
     |-- DateID: integer (nullable = false)
     |-- CreateTime: timestamp (nullable = false)
     |-- Retention1: integer (nullable = false)
     |-- Retention2: integer (nullable = true)
     |-- Retention3: integer (nullable = true)
     |-- Retention4: integer (nullable = true)
     |-- Retention5: integer (nullable = true)
     |-- Retention6: integer (nullable = true)
     |-- Retention7: integer (nullable = true)
     |-- Retention10: integer (nullable = true)
     |-- Retention14: integer (nullable = true)
     |-- Retention21: integer (nullable = true)
     |-- Retention30: integer (nullable = true)
     |-- GameID: integer (nullable = false)
     |-- id: long (nullable = false)
     |-- adcode: string (nullable = true)
     |-- AddRMB1: double (nullable = true)
     |-- AddRMB2: double (nullable = true)
     |-- AddRMB3: double (nullable = true)
     |-- AddRMB4: double (nullable = true)
     |-- AddRMB5: double (nullable = true)
     |-- AddRMB6: double (nullable = true)
     |-- AddRMB7: double (nullable = true)
     |-- AddRMB10: double (nullable = true)
     |-- AddRMB14: double (nullable = true)
     |-- AddRMB21: double (nullable = true)
     |-- AddRMB30: double (nullable = true)
     |-- LoginTimes: integer (nullable = true)
     |-- LoginMinutes: integer (nullable = true)


    scala> mysqldf.count()
    res2: Long = 76813

    scala> mysqldf.show(2)
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    |AccountName|   Accid|platid|  DateID|         CreateTime|Retention1|Retention2|Retention3|Retention4|Retention5|Retention6|Retention7|Retention10|Retention14|Retention21|Retention30|GameID|   id|adcode|AddRMB1|AddRMB2|AddRMB3|AddRMB4|AddRMB5|AddRMB6|AddRMB7|AddRMB10|AddRMB14|AddRMB21|AddRMB30|LoginTimes|LoginMinutes|
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    |           | 1004210|     6|20180116|2018-01-16 10:39:50|         1|         0|         0|         0|         0|         0|         0|          0|          0|          0|          0|  3004|22438|      |    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|     0.0|     0.0|     0.0|     0.0|         1|           7|
    |           |20946754|     0|20170913|2017-09-13 10:02:37|         1|         0|         0|         1|         0|         0|         0|          0|          0|          0|          0|  3004|  167|      |    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|    0.0|     0.0|     0.0|     0.0|     0.0|         3|         219|
    +-----------+--------+------+--------+-------------------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+------+-----+------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+--------+----------+------------+
    only showing top 2 rows

    scala> mysqldf.select("accid").show(2)
    +--------+
    |   accid|
    +--------+
    |20964769|
    |22235886|
    +--------+
    only showing top 2 rows


    scala> mysqldf.select("accid","platid").show(2)
    +--------+------+
    |   accid|platid|
    +--------+------+
    | 1004210|     6|
    |20946754|     0|
    +--------+------+
    only showing top 2 rows


    scala> mysqldf.filter($"dateid">20180510).count
    res9: Long = 5101

    scala> mysqldf.select($"accid",$"platid"+1000).show(2)
    +--------+---------------+
    |   accid|(platid + 1000)|
    +--------+---------------+
    | 1004210|           1006|
    |20946754|           1000|
    +--------+---------------+
    only showing top 2 rows

    scala> mysqldf.groupBy("platid").count().show
    +------+-----+
    |platid|count|
    +------+-----+
    |    27| 7157|
    |    93|   44|
    |   291|   10|
    |     1| 8503|
    |    13|  290|
    |     6| 4765|
    |     3| 3281|
    |   295|    2|
    | 10000|    1|
    |   191|  758|
    | 24294|    9|
    |    19| 1549|
    |    15| 8838|
    |    17|    6|
    |     9|  365|
    |   286|    1|
    |    35| 4075|
    |     4|10395|
    |   247|    1|
    |   277|  453|
    +------+-----+
    only showing top 20 rows

    scala> mysqldf.filter($"dateid">20180520).groupBy("platid").count().show
    +------+-----+
    |platid|count|
    +------+-----+
    |    27|  131|
    |    93|   14|
    |   291|    2|
    |     1|  333|
    |    13|   25|
    |     6|  116|
    |     3|   36|
    |   191|  136|
    | 24294|    2|
    |    19|   39|
    |    15|  978|
    |     9|    2|
    |    35|   72|
    |     4|  161|
    |   277|   11|
    |    50|    8|
    |    38|    4|
    |   289|   12|
    |    21|   24|
    |    60|   75|
    +------+-----+
    only showing top 20 rows

    scala> mysqldf.createOrReplaceTempView("account")


    scala> val sqldf=spark.sql("select platid,accid,dateid  from account where dateid>=20180601" )
    sqldf: org.apache.spark.sql.DataFrame = [platid: int, accid: bigint ... 1 more field]


    scala> sqldf.show(2)
    +------+--------+--------+
    |platid|   accid|  dateid|
    +------+--------+--------+
    |     0|22514097|20180601|
    |    36|22857594|20180601|
    +------+--------+--------+
    only showing top 2 rows

    ------------------------------------

    Spark SQL中的临时视图是会话范围的，如果创建它们的会话终止，它们将消失。如果您希望拥有一个在所有会话之间共享、并在Spark应用程序终止之前一直保持活动状态的临时视图，可以创建一个全局临时视图。全局临时视图绑定到系统保留的数据库global_temp，必须使用限定名来引用它，例如：SELECT * FROM global_temp.view1。

    --------------------------------------

    scala> mysqldf.createOrReplaceGlobalTempView("tb_acc")

    scala> val globaldf=spark.sql("select platid,accid,dateid  from global_temp.tb_acc where dateid>=20180601" )
    globaldf: org.apache.spark.sql.DataFrame = [platid: int, accid: bigint ... 1 more field]

    scala> globaldf.show(2)
    +------+--------+--------+
    |platid|   accid|  dateid|
    +------+--------+--------+
    |     0|22514097|20180601|
    |    36|22857594|20180601|
    +------+--------+--------+
    only showing top 2 rows

    --------------------------

    Datasets are similar to RDDs, however, instead of using Java serialization or Kryo they use a specialized Encoder to serialize the objects for processing or transmitting over the network. While both encoders and standard serialization are responsible for turning an object into bytes, encoders are code generated dynamically and use a format that allows Spark to perform many operations like filtering, sorting and hashing without deserializing the bytes back into an object.

    数据集（Dataset）与RDD类似，但它们不使用Java序列化或Kryo，而是使用专门的编码器（Encoder）来序列化对象，以便进行处理或在网络上传输。尽管编码器和标准序列化都负责将对象转换为字节，但编码器是动态生成的代码，并使用一种特殊格式，使Spark无需将字节反序列化回对象即可执行过滤、排序和散列等许多操作。

    ----------------------------------------------------------------

    scala> val df1=spark.sql("select distinct platid,dateid  from account where dateid>=20180601" )
    df1: org.apache.spark.sql.DataFrame = [platid: int, dateid: int]


    scala> val ds=df1.toDF
    ds: org.apache.spark.sql.DataFrame = [platid: int, dateid: int]

    scala> mysqldf.where("dateid>20180601").count()

    res36: Long = 2249

    scala> mysqldf.filter("dateid>20180601").count()

    res37: Long = 2249

    scala> mysqldf.apply("accid")
    res38: org.apache.spark.sql.Column = accid

    scala> mysqldf.filter("dateid>20180601").orderBy(mysqldf("dateid")).show 顺序

     scala> mysqldf.filter("dateid>20180601").orderBy(-mysqldf("dateid")).show  逆序

    scala> mysqldf.filter("dateid>20180601").orderBy(mysqldf("dateid").desc).show 逆序

    scala> mysqldf.groupBy("platid").agg(max("dateid"),min("dateid")).show(2)
    +------+-----------+-----------+
    |platid|max(dateid)|min(dateid)|
    +------+-----------+-----------+
    |    27|   20180619|   20170906|
    |    93|   20180615|   20180314|
    +------+-----------+-----------+
    only showing top 2 rows

  • 相关阅读:
    51nod 1050 循环数组最大子段和
    51nod 1183 编辑距离
    百度之星初赛一 补题
    jquery click嵌套 事件重复注册 多次执行的问题
    Shell变量赋值语句不能有空格
    Linux得到某个文件夹内文件的个数
    Ubuntu14.04下sogou输入法的输入框只显示英文不显示中文的问题
    Eclipse中mybatis的xml文件没有提示,出现the file cannot be validated as the XML definition.....
    整理一下postgresql的扩展功能postgis和pgrouting的使用
    Windows应用程序未响应
  • 原文地址:https://www.cnblogs.com/playforever/p/9204045.html
Copyright © 2011-2022 走看看