1.用户自定义schema
data
json串格式如下:
{
"partner_code": "demo",
"app_name": "web",
"person_info": {
"name": "张三",
"age": 18
},
"items": [
{
"item_id": 1,
"item_name": "王家村",
"group": "group1"
},
{
"item_id": 2,
"item_name": "李家澡堂",
"item_detail": {
"platform_count": 2
},
"group": "group2"
}
]
}
spark1.3
在spark1.3我们是这样处理的
//定义schema
val struct =StructType(
StructField("partner_code", StringType, true) ::
StructField("app_name", StringType, true)::
StructField("person_info",MapType(StringType,StringType,true)) ::
StructField("items",ArrayType(MapType(StringType,StringType,true))) ::
Nil)
val data = sc.textFile("path/jsonFile")
val df = sqlContext.jsonRDD(data,struct)
df.printSchema
df.show
spark1.4
//定义schema
val struct =StructType(
StructField("partner_code", StringType, true) ::
StructField("app_name", StringType, true)::
StructField("person_info",MapType(StringType,StringType,true)) ::
StructField("items",ArrayType(MapType(StringType,StringType,true))) ::
Nil)
val df = sqlContext.read.schema(struct).json("path/jsonFile")
输出结果
//df.printSchema
root
|-- partner_code: string (nullable = true)
|-- app_name: string (nullable = true)
|-- person_info: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
|-- items: array (nullable = true)
| |-- element: map (containsNull = true)
| | |-- key: string
| | |-- value: string (valueContainsNull = true)
//df.show
+------------+--------+--------------------+--------------------+
|partner_code|app_name| person_info| items|
+------------+--------+--------------------+--------------------+
| demo| web|Map(name -> 张三, a...|List(Map(item_id ...|
+------------+--------+--------------------+--------------------+
系统自动生成schema
直接使用自带的解析会更方便,不过那样会产生大量的struct结构,同时如果结构复杂多变将会产生大量的空值。
//不需要定义schema,系统自动判断生成
val df = sqlContext.read.json("path/jsonFile")
df.printSchema
df.show
输出结果
//df.printSchema
root
|-- app_name: string (nullable = true)
|-- items: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- group: string (nullable = true)
| | |-- item_detail: struct (nullable = true)
| | | |-- platform_count: long (nullable = true)
| | |-- item_id: long (nullable = true)
| | |-- item_name: string (nullable = true)
|-- partner_code: string (nullable = true)
|-- person_info: struct (nullable = true)
| |-- age: long (nullable = true)
| |-- name: string (nullable = true)
//df.show
+--------+--------------------+------------+-----------+
|app_name| items|partner_code|person_info|
+--------+--------------------+------------+-----------+
| web|List([group1,null...| demo| [18,张三]|
+--------+--------------------+------------+-----------+