zoukankan      html  css  js  c++  java
  • pyspark dataframe save into hive

    # Define the data type of each dataframe column up front.
    #
    # FIX: the original snippet used abstract base types (AtomicType,
    # NumericType, IntegralType, FractionalType) that cannot appear in a
    # concrete schema, and called ArrayType()/MapType() without their
    # required element/key/value type arguments — ArrayType() and MapType()
    # raise TypeError immediately, so the snippet crashed before
    # createDataFrame was ever reached. Concrete, fully-parameterized
    # types are used below instead.

    from pyspark.sql.types import *
    schema = StructType([
    StructField("a", NullType(), True),
    StructField("b", StringType(), True),    # was AtomicType (abstract base class)
    StructField("c", DoubleType(), True),    # was NumericType (abstract base class)
    StructField("d", LongType(), True),      # was IntegralType (abstract base class)
    StructField("e", DoubleType(), True),    # was FractionalType (abstract base class)
    StructField("f", StringType(), True),
    StructField("g", BinaryType(), True),
    StructField("h", BooleanType(), True),
    StructField("i", DateType(), True),
    StructField("j", TimestampType(), True),
    StructField("k", DecimalType(), True),   # defaults to decimal(10,0)
    StructField("l", DoubleType(), True),
    StructField("m", FloatType(), True),
    StructField("n", ByteType(), True),
    StructField("o", IntegerType(), True),
    StructField("p", LongType(), True),
    StructField("q", ShortType(), True),
    StructField("r", ArrayType(StringType()), True),                  # element type is mandatory
    StructField("s", MapType(StringType(), IntegerType()), True)])    # key and value types are mandatory

    # Create an empty dataframe from the schema defined above.
    # (assumes an active SparkSession bound to the name `spark`)
    df1 = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

    from pyspark.sql.types import *

    # (column name, Spark SQL type) for every column of the dataframe;
    # NullType marks a column whose values are always null. All columns
    # are declared nullable.
    _column_types = [
        ("a", NullType()),
        ("b", BooleanType()),
        ("c", ByteType()),
        ("d", ShortType()),
        ("e", IntegerType()),
        ("f", LongType()),
        ("g", FloatType()),
        ("h", DoubleType()),
        ("i", DecimalType()),
        ("j", StringType()),
        ("k", BinaryType()),
        ("l", DateType()),
        ("m", TimestampType()),
        ("n", ArrayType(StringType())),
        ("o", MapType(StringType(), IntegerType())),
    ]
    schema = StructType(
        [StructField(col, dtype, True) for col, dtype in _column_types])
    =================================================================
    from pyspark.sql.types import *

    # Build the schema incrementally with StructType.add(); every column
    # is nullable. Covers one column of each common Spark SQL type.
    schema = StructType()
    for _name, _dtype in (
            ("b", BooleanType()),
            ("c", ByteType()),
            ("d", ShortType()),
            ("e", IntegerType()),
            ("f", LongType()),
            ("g", FloatType()),
            ("h", DoubleType()),
            ("i", DecimalType()),
            ("j", StringType()),
            ("k", BinaryType()),
            ("l", DateType()),
            ("m", TimestampType()),
            ("n", ArrayType(StringType())),
            ("o", MapType(StringType(), IntegerType()))):
        schema = schema.add(_name, _dtype, True)

    # Empty dataframe with the schema above (requires an active
    # SparkSession bound to `spark`).
    df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

    =============================================================================
    pyspark 创建dataframe
    >>> from pyspark.sql.types import *
    >>> schema = StructType([
    ... StructField("b", BooleanType(), True),
    ... StructField("c", ByteType(), True),
    ... StructField("d", ShortType(), True),
    ... StructField("e", IntegerType(), True),
    ... StructField("f", LongType(), True),
    ... StructField("g", FloatType(), True),
    ... StructField("h", DoubleType(), True),
    ... StructField("i", DecimalType(), True),
    ... StructField("j", StringType(), True),
    ... StructField("k", BinaryType(), True),
    ... StructField("l", DateType(), True),
    ... StructField("m", TimestampType(), True),
    ... StructField("n", ArrayType(StringType()), True),
    ... StructField("o", MapType(StringType(), IntegerType()), True)])
    >>> df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
    >>> df.show()
    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+
    | b| c| d| e| f| g| h| i| j| k| l| m| n| o|
    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+
    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+

    >>> df.printSchema()
    root
    |-- b: boolean (nullable = true)
    |-- c: byte (nullable = true)
    |-- d: short (nullable = true)
    |-- e: integer (nullable = true)
    |-- f: long (nullable = true)
    |-- g: float (nullable = true)
    |-- h: double (nullable = true)
    |-- i: decimal(10,0) (nullable = true)
    |-- j: string (nullable = true)
    |-- k: binary (nullable = true)
    |-- l: date (nullable = true)
    |-- m: timestamp (nullable = true)
    |-- n: array (nullable = true)
    | |-- element: string (containsNull = true)
    |-- o: map (nullable = true)
    | |-- key: string
    | |-- value: integer (valueContainsNull = true)

    #保存dataframe 到hive ,默认default 数据库
    >>> df.write.saveAsTable("pysparkdf")

    beeline -u jdbc:hive2://hdp-node3:10000 -n hadoop
    0: jdbc:hive2://hdp-node3:10000> show databases;
    +------------------------+--+
    | database_name |
    +------------------------+--+
    | da_component_instance |
    | default |
    | fileformatdb |
    | ods |
    | test |
    +------------------------+--+
    5 rows selected (0.6 seconds)
    0: jdbc:hive2://hdp-node3:10000> use default;
    No rows affected (0.493 seconds)
    0: jdbc:hive2://hdp-node3:10000> show tables;
    +------------------------------+--+
    | tab_name |
    +------------------------------+--+
    | liutest |
    | pysparkdf |
    +------------------------------+--+
    2 rows selected (0.523 seconds)
    0: jdbc:hive2://hdp-node3:10000> desc pysparkdf;
    +-----------+------------------+----------+--+
    | col_name | data_type | comment |
    +-----------+------------------+----------+--+
    | b | boolean | |
    | c | tinyint | |
    | d | smallint | |
    | e | int | |
    | f | bigint | |
    | g | float | |
    | h | double | |
    | i | decimal(10,0) | |
    | j | string | |
    | k | binary | |
    | l | date | |
    | m | timestamp | |
    | n | array<string> | |
    | o | map<string,int> | |
    +-----------+------------------+----------+--+
    14 rows selected (0.61 seconds)

  • 相关阅读:
    python的sorted相关
    dict两种遍历方法
    python 深拷贝和浅拷贝浅析
    牛人总结python中string模块各属性以及函数的用法,果断转了,好东西
    Python二分查找
    堆和栈区别
    一次完整的HTTP事务是怎样一个过程?(转)
    ------shell学习
    BZOJ1025 [SCOI2009]游戏
    BZOJ1024 [SCOI2009]生日快乐
  • 原文地址:https://www.cnblogs.com/songyuejie/p/14289283.html
Copyright © 2011-2022 走看看