Spark SQL快速入门
本地表
(1)准备数据
[root@node1 ~]# mkdir /tmp/data
[root@node1 ~]# cat data/ml-1m/users.dat |tr -s "::" "," >> /tmp/data/users.dat
[root@node1 ~]# tail -5 /tmp/data/users.dat
6036,F,25,15,32603
6037,F,45,1,76006
6038,F,56,1,14706
6039,F,45,0,01060
6040,M,25,6,11106
[root@node1 ~]#
(2)确认HDFS已经启动
(3)进入spark-sql
[root@node1 ~]# spark-sql
17/10/11 09:49:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/10/11 09:50:01 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
17/10/11 09:50:01 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
17/10/11 09:50:18 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
spark-sql>
(4)创建表
spark-sql> create external table user(
> userid int,
> gender string,
> age int,
> occupation string,
> zipcode int
> )
> row format delimited fields terminated by ','
> stored as textfile
> location 'file:///tmp/data';
Time taken: 4.032 seconds
spark-sql>
(5)查看表结构
spark-sql> show create table user;
CREATE EXTERNAL TABLE `user`(`userid` int, `gender` string, `age` int, `occupation` string, `zipcode` int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim' = ',',
'serialization.format' = ','
)
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'file:/tmp/data'
TBLPROPERTIES (
'rawDataSize' = '-1',
'numFiles' = '0',
'transient_lastDdlTime' = '1507730977',
'totalSize' = '0',
'COLUMN_STATS_ACCURATE' = 'false',
'numRows' = '-1'
)
Time taken: 1.816 seconds, Fetched 1 row(s)
spark-sql>
(6)查询本地数据表
spark-sql> select * from user limit 10;
1 F 1 10 48067
2 M 56 16 70072
3 M 25 15 55117
4 M 45 7 2460
5 M 25 20 55455
6 F 50 9 55117
7 M 35 1 6810
8 M 25 12 11413
9 M 25 17 61614
10 F 35 1 95370
Time taken: 2.95 seconds, Fetched 10 row(s)
spark-sql>
使用spark-sql处理Hive MetaStore中的表
注意,默认HDFS已经启动。
(1)hive-site.xml
Hive可以通过服务的形式对外提供元数据读写操作。为此,修改配置文件 $HIVE_HOME/conf/hive-site.xml,增加如下内容:
<property>
<name>hive.metastore.uris</name>
<value>thrift://node:9083</value>
</property>
其中,node是运行metastore服务的节点的主机名。
我的hive-site.xml内容如下:
[root@node3 ~]# cat /opt/hive-2.1.1/conf/hive-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>hive.metastore.uris</name>
<value>thrift://node3:9083</value>
<description>用于metastore客户端连接远程metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://node3:3306/hive?createDatabaseIfNotExist=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<!--mysql数据库用户名-->
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<property>
<name>mapred.reduce.tasks</name>
<value>2</value>
</property>
</configuration>
[root@node3 ~]#
(2)启动hive metastore
[root@node3 ~]# hive --service hiveserver2 >/dev/null 2>/dev/null &
[root@node3 ~]# hive --service metastore 1>/dev/null 2>&1 &
[root@node3 ~]# jobs
[1]- Running hive --service hiveserver2 > /dev/null 2> /dev/null &
[2]+ Running hive --service metastore > /dev/null 2>&1 &
[root@node3 ~]#
(3)将hive的配置文件复制到spark
[root@node3 ~]# scp /opt/hive-2.1.1/conf/hive-site.xml node1:/opt/spark-2.2.0/conf
(4)启动spark-sql
[root@node1 ~]# spark-sql
17/10/12 09:56:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/10/12 09:56:15 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
17/10/12 09:56:15 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
spark-sql>
(5)查看Hive数据表
spark-sql> show tables;
default copyemp false
default demo false
default dept false
default dual false
default emp false
default empbak false
default employees false
default mytb false
default users false
Time taken: 4.908 seconds, Fetched 9 row(s)
spark-sql> select * from users;
4 aa
5 bb
6 cc
Time taken: 3.196 seconds, Fetched 3 row(s)
spark-sql>