zoukankan      html  css  js  c++  java
  • pyspark dataframe save into hive

    # Define the data type of each dataframe column up front.
    #
    # FIX: the original snippet used abstract base types (AtomicType,
    # NumericType, IntegralType, FractionalType) that cannot appear in a
    # concrete schema, and called ArrayType()/MapType() without their
    # required element/key/value type arguments — ArrayType() and MapType()
    # raise TypeError immediately, so the snippet crashed before
    # createDataFrame was ever reached. Concrete, fully-parameterized
    # types are used below instead.

    from pyspark.sql.types import *
    schema = StructType([
    StructField("a", NullType(), True),
    StructField("b", StringType(), True),    # was AtomicType (abstract base class)
    StructField("c", DoubleType(), True),    # was NumericType (abstract base class)
    StructField("d", LongType(), True),      # was IntegralType (abstract base class)
    StructField("e", DoubleType(), True),    # was FractionalType (abstract base class)
    StructField("f", StringType(), True),
    StructField("g", BinaryType(), True),
    StructField("h", BooleanType(), True),
    StructField("i", DateType(), True),
    StructField("j", TimestampType(), True),
    StructField("k", DecimalType(), True),   # defaults to decimal(10,0)
    StructField("l", DoubleType(), True),
    StructField("m", FloatType(), True),
    StructField("n", ByteType(), True),
    StructField("o", IntegerType(), True),
    StructField("p", LongType(), True),
    StructField("q", ShortType(), True),
    StructField("r", ArrayType(StringType()), True),                  # element type is mandatory
    StructField("s", MapType(StringType(), IntegerType()), True)])    # key and value types are mandatory

    # Create an empty dataframe from the schema defined above.
    # (assumes an active SparkSession bound to the name `spark`)
    df1 = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

    from pyspark.sql.types import *

    # (column name, Spark SQL type) for every column of the dataframe;
    # NullType marks a column whose values are always null. All columns
    # are declared nullable.
    _column_types = [
        ("a", NullType()),
        ("b", BooleanType()),
        ("c", ByteType()),
        ("d", ShortType()),
        ("e", IntegerType()),
        ("f", LongType()),
        ("g", FloatType()),
        ("h", DoubleType()),
        ("i", DecimalType()),
        ("j", StringType()),
        ("k", BinaryType()),
        ("l", DateType()),
        ("m", TimestampType()),
        ("n", ArrayType(StringType())),
        ("o", MapType(StringType(), IntegerType())),
    ]
    schema = StructType(
        [StructField(col, dtype, True) for col, dtype in _column_types])
    =================================================================
    from pyspark.sql.types import *

    # Build the schema incrementally with StructType.add(); every column
    # is nullable. Covers one column of each common Spark SQL type.
    schema = StructType()
    for _name, _dtype in (
            ("b", BooleanType()),
            ("c", ByteType()),
            ("d", ShortType()),
            ("e", IntegerType()),
            ("f", LongType()),
            ("g", FloatType()),
            ("h", DoubleType()),
            ("i", DecimalType()),
            ("j", StringType()),
            ("k", BinaryType()),
            ("l", DateType()),
            ("m", TimestampType()),
            ("n", ArrayType(StringType())),
            ("o", MapType(StringType(), IntegerType()))):
        schema = schema.add(_name, _dtype, True)

    # Empty dataframe with the schema above (requires an active
    # SparkSession bound to `spark`).
    df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

    =============================================================================
    pyspark 创建dataframe
    >>> from pyspark.sql.types import *
    >>> schema = StructType([
    ... StructField("b", BooleanType(), True),
    ... StructField("c", ByteType(), True),
    ... StructField("d", ShortType(), True),
    ... StructField("e", IntegerType(), True),
    ... StructField("f", LongType(), True),
    ... StructField("g", FloatType(), True),
    ... StructField("h", DoubleType(), True),
    ... StructField("i", DecimalType(), True),
    ... StructField("j", StringType(), True),
    ... StructField("k", BinaryType(), True),
    ... StructField("l", DateType(), True),
    ... StructField("m", TimestampType(), True),
    ... StructField("n", ArrayType(StringType()), True),
    ... StructField("o", MapType(StringType(), IntegerType()), True)])
    >>> df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
    >>> df.show()
    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+
    | b| c| d| e| f| g| h| i| j| k| l| m| n| o|
    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+
    +---+---+---+---+---+---+---+---+---+---+---+---+---+---+

    >>> df.printSchema()
    root
    |-- b: boolean (nullable = true)
    |-- c: byte (nullable = true)
    |-- d: short (nullable = true)
    |-- e: integer (nullable = true)
    |-- f: long (nullable = true)
    |-- g: float (nullable = true)
    |-- h: double (nullable = true)
    |-- i: decimal(10,0) (nullable = true)
    |-- j: string (nullable = true)
    |-- k: binary (nullable = true)
    |-- l: date (nullable = true)
    |-- m: timestamp (nullable = true)
    |-- n: array (nullable = true)
    | |-- element: string (containsNull = true)
    |-- o: map (nullable = true)
    | |-- key: string
    | |-- value: integer (valueContainsNull = true)

    #保存dataframe 到hive ,默认default 数据库
    >>> df.write.saveAsTable("pysparkdf")

    beeline -u jdbc:hive2://hdp-node3:10000 -n hadoop
    0: jdbc:hive2://hdp-node3:10000> show databases;
    +------------------------+--+
    | database_name |
    +------------------------+--+
    | da_component_instance |
    | default |
    | fileformatdb |
    | ods |
    | test |
    +------------------------+--+
    5 rows selected (0.6 seconds)
    0: jdbc:hive2://hdp-node3:10000> use default;
    No rows affected (0.493 seconds)
    0: jdbc:hive2://hdp-node3:10000> show tables;
    +------------------------------+--+
    | tab_name |
    +------------------------------+--+
    | liutest |
    | pysparkdf |
    +------------------------------+--+
    2 rows selected (0.523 seconds)
    0: jdbc:hive2://hdp-node3:10000> desc pysparkdf;
    +-----------+------------------+----------+--+
    | col_name | data_type | comment |
    +-----------+------------------+----------+--+
    | b | boolean | |
    | c | tinyint | |
    | d | smallint | |
    | e | int | |
    | f | bigint | |
    | g | float | |
    | h | double | |
    | i | decimal(10,0) | |
    | j | string | |
    | k | binary | |
    | l | date | |
    | m | timestamp | |
    | n | array<string> | |
    | o | map<string,int> | |
    +-----------+------------------+----------+--+
    14 rows selected (0.61 seconds)

  • 相关阅读:
    python的sorted相关
    dict两种遍历方法
    python 深拷贝和浅拷贝浅析
    牛人总结python中string模块各属性以及函数的用法,果断转了,好东西
    Python二分查找
    堆和栈区别
    一次完整的HTTP事务是怎样一个过程?(转)
    ------shell学习
    BZOJ1025 [SCOI2009]游戏
    BZOJ1024 [SCOI2009]生日快乐
  • 原文地址:https://www.cnblogs.com/songyuejie/p/14289283.html
Copyright © 2011-2022 走看看