1. 没有插入值的字段(即为Null的字段),在HBase中不会存储实际数据。
创建表: create table t1 (id varchar not null primary key, cf1.a varchar, cf1.b varchar, cf2.c varchar,cf2.d varchar, cf2.e varchar );
插入数据到c、d字段,a、b、e字段为Null :upsert into t1(id,c,d) values('0000000001','ccc','ddd');
查看数据:
select * from t1;
+------------+------------+------------+------------+------------+------------+
| ID | A | B | C | D | E |
+------------+------------+------------+------------+------------+------------+
| 0000000001 | null | null |
ccc | ddd | null |
+------------+------------+------------+------------+------------+------------+
hbase(main):028:0> scan 'T1'
ROW COLUMN+CELL
0000000001 column=CF1:_0, timestamp=1415238625585, value=
0000000001 column=CF2:C, timestamp=1415238625585, value=ccc
0000000001 column=CF2:D, timestamp=1415238625585, value=ddd
插入数据到a、d字段,b,c ,e字段为Null : upsert into t1 values('0000000002','aaa',null,null,'ddd',null);
select * from t1;
+------------+------------+------------+------------+------------+------------+
| ID | A | B | C | D | E |
+------------+------------+------------+------------+------------+------------+
| 0000000001 | null | null |
ccc | ddd | null |
| 0000000002 | aaa | null |
null | ddd | null
|
+------------+------------+------------+------------+------------+------------+
hbase(main):029:0> scan 'T1'
ROW COLUMN+CELL
0000000001 column=CF1:_0, timestamp=1415238625585, value=
0000000001 column=CF2:C, timestamp=1415238625585, value=ccc
0000000001 column=CF2:D, timestamp=1415238625585, value=ddd
0000000002 column=CF1:A, timestamp=1415238882597, value=aaa
0000000002 column=CF1:_0, timestamp=1415238882597, value=
0000000002 column=CF2:D, timestamp=1415238882597, value=ddd
2、索引结构
创建索引:create index index_t1_a_b_in_c_d on t1(a,c) include (b,d)
实际创建一个表:
hbase(main):031:0> describe 'INDEX_T1_A_C_IN_B_D'
'INDEX_T1_A_C_IN_B_D', {TABLE_ATTRIBUTES => {MAX_FILESIZE => '7516192768', coprocessor$1 => '|org.apache.phoenix.coprocessor.ScanRegionObserver|1|', coprocessor$2 => '|org.apache.phoenix.coprocessor.UngroupedAggregateRegionObserver|1|', coprocessor$3 =>oprocessor$4=>'|org.apache.phoenix.coprocessor.ServerCachingEndpointImpl|1|',
coprocessor$5 => '|org.apache.hadoop.hbase.regionserver.LocalIndexSplitter|1|'}, { NAME =>'CF1', DATA_BLOCK_ENCODING => 'FAST_DIFF', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION => 'NONE', MIN_VERSIONS => '0',
TTL => 'FOREVER', KEEP_DELETED_CELLS => 'true', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'}, {NAME => 'CF2', DATA_BLOCK_ENCODING => 'FAST_DIFF', BLOOMFILTER => 'ROW', REPLICATION_SCOPE => '0', VERSIONS => '1', COMPRESSION
=> 'NONE', MIN_VERSIONS => '0',TTL => 'FOREVER', KEEP_DELETED_CELLS => 'true', BLOCKSIZE => '65536', IN_MEMORY => 'false', BLOCKCACHE => 'true'}
插入数据:
upsert into t1 values('0000000001','10','11','12','13',"14");
upsert into t1 values('0000000002','20','21','22','23',"24");
upsert into t1 values('0000000003','30','31','32','33',"24");
select * from t1;
+------------+------------+------------+------------+----------+------------+
| ID | A | B | C | D | E |
+------------+------------+------------+------------+----------+------------+
| 0000000001 | 10 | 11 | 12 | 13 | 14 |
| 0000000002 | 20 | 21 | 22 | 23 | 24 |
| 0000000003 | 30 | 31 | 32 | 33 | 34 |
+------------+------------+------------+------------+----------+------------+
索引INDEX_T1_A_C_IN_B_D
select * from INDEX_T1_A_C_IN_B_D;
+------------+------------+------------+------------+------------+
| CF1:A | CF2:C | :ID | CF1:B | CF2:D |
+------------+------------+------------+------------+------------+
| 10 | 12 | 0000000001 | 11 | 13 |
| 20 | 22 | 0000000002 | 21 | 23 |
| 30 | 32 | 0000000003 | 31 | 33 |
+------------+------------+------------+------------+------------+
scan 'INDEX_T1_A_C_IN_B_D'
ROW COLUMN+CELL
10x0012x000000000001 column=CF1:CF1:B, timestamp=1415240129108, value=11
10x0012x000000000001 column=CF1:_0, timestamp=1415240129108, value=
10x0012x000000000001 column=CF2:CF2:D, timestamp=1415240129108, value=13
20x0022x000000000002 column=CF1:CF1:B, timestamp=1415240135689, value=21
20x0022x000000000002 column=CF1:_0, timestamp=1415240135689, value=
20x0022x000000000002 column=CF2:CF2:D, timestamp=1415240135689, value=23
30x0032x000000000003 column=CF1:CF1:B, timestamp=1415240148794, value=31
30x0032x000000000003 column=CF1:_0, timestamp=1415240148794, value=
30x0032x000000000003 column=CF2:CF2:D, timestamp=1415240148794, value=33
3、如果索引中有null
upsert into t1 values('0000000004',null,'31',null,'33', '34');
| ID | A | B | C | D | E |
+------------+------------+------------+------------+---------+----------+
| 0000000004 | null | 31 | null | 33 | 34 |
+------------+------------+------------+------------+----------+---------+
索引INDEX_T1_A_C_IN_B_D数据:
+------------+------------+--------------------+-------------+------------+
| CF1:A | CF2:C | :ID | CF1:B | CF2:D |
+------------+------------+--------------------+------------+------------+
| null | null | 0000000004 | 31 | 33 |
+------------+------------+--------------------+-------------+------------+
ROW COLUMN+CELL
x00x000000000004 column=CF1:CF1:B, timestamp=1415240916266, value=31
x00x000000000004 column=CF1:_0, timestamp=1415240916266, value=
x00x000000000004 column=CF2:CF2:D, timestamp=1415240916266, value=33
4. 索引使用
其定义为:
create index index_t1_a_b_in_c_d on t1(a,c) include (b,d)
(1)、索引中没有包括字段 e, 故只有select字段中有e,就不会用到索引:
explain select * from t1 where a='10' ;
CLIENT PARALLEL 1-WAY FULL SCAN OVER T1
SERVER FILTER BY CF1.A = '10'
explain select a, e from t1 where a='10';
CLIENT PARALLEL 1-WAY FULL SCAN OVER T1
SERVER FILTER BY CF1.A = '10'
(2)、不使用e,则可以
explain select a,b,c,d from t1 where a='10';
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['10']
explain select d,c,b,a from t1 where a='10' and c='12';
|CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['10','12']
(3)、索引 a、b 的顺序可以调整
explain select d,c,b,a from t1 where c='10' and a='12';
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['12','10']
explain select d,c,b,a from t1 where c>'10' and a='12';
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_B_IN_C_D ['12','10'] - ['12',*]
explain select d,c,b,a from t1 where c='10' and a>'12';
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['12'] - [*]
SERVER FILTER BY C = '10'
explain select d,c,b,a from t1 where c>'10' and a>'12';
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['12'] - [*]
SERVER FILTER BY C > '10'
(4)、混合使用索引字段a 和include 字段b,可以用到索引
explain select a,b,c,d from t1 where a='10' and b='11';
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['10']
SERVER FILTER BY CF1.B = '11'
(5)、混合使用索引字段a 和非include 字段e,则用不到索引
explain select d,c,b,a from t1 where a='10' and e='12';
CLIENT PARALLEL 1-WAY FULL SCAN OVER T1
SERVER FILTER BY (CF1.A = '10' AND CF2.E = '12')
(6)、Group By 可以使用索引
explain select b, max(c), min(d) from t1 where a>'10' group by b
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['10'] - [*]
SERVER AGGREGATE INTO DISTINCT ROWS BY [CF1.B]
CLIENT MERGE SORT
explain select a, max(c), min(d) from t1 where b>'10' group by a
CLIENT PARALLEL 32-WAY FULL SCAN OVER INDEX_T1_A_C_IN_B_D
SERVER FILTER BY CF1.B > '10'
SERVER AGGREGATE INTO ORDERED DISTINCT ROWS BY [A]
CLIENT MERGE SORT
explain select a, max(b), min(d) from t1 where c>'10' group by a;
SERVER FILTER BY C > '10'
SERVER AGGREGATE INTO ORDERED DISTINCT ROWS BY [A]
CLIENT MERGE SORT
SERVER FILTER BY CF1.B > '11'
SERVER AGGREGATE INTO ORDERED DISTINCT ROWS BY [A]
CLIENT MERGE SORT
explain select a, count(b), max(c), min(d) from t1 where a>'1' and c>'11' group by a
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['1'] - [*]
SERVER FILTER BY C > '11'
SERVER AGGREGATE INTO ORDERED DISTINCT ROWS BY [A]
CLIENT MERGE SORT
explain select count(a), count(b), c, min(d) from t1 where a>'1' and c>'11' group by c ;
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_C_IN_B_D ['1'] - [*]
SERVER FILTER BY C > '11'
SERVER AGGREGATE INTO DISTINCT ROWS BY [C]
CLIENT MERGE SORT
explain select count(a), count(b), c, min(d) from t1 where a='10' and c>'11' group by c
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_B_IN_C_D ['10','11'] - ['10',*]
SERVER AGGREGATE INTO DISTINCT ROWS BY [C]
CLIENT MERGE SORT
explain select count(a), b, max(c), min(d) from t1 where a='10' and c>'11' group by b ;
CLIENT PARALLEL 32-WAY RANGE SCAN OVER INDEX_T1_A_B_IN_C_D ['10','11'] - ['10',*]
SERVER AGGREGATE INTO DISTINCT ROWS BY [CF1.B]
CLIENT MERGE SORT