1. 过滤器
要完成一个过滤操作,至少需要两个参数。一个是抽象的操作符,HBase
提供了枚举类型的变量来表示这些抽象的操作符:LESS/LESS_OR_EQUAL/EQUAL/NOT_EQUAL
等;另外一个是具体的比较器(Comparator),代表具体的比较逻辑,例如可以提供字节级的比较、字符串级的比较等。有了这两个参数,我们就可以清晰地定义筛选条件,过滤数据。
抽象操作符(比较运算符)
LESS <
LESS_OR_EQUAL <=
EQUAL =
NOT_EQUAL <>
GREATER_OR_EQUAL >=
GREATER >
NO_OP 排除所有
比较器(指定比较机制)
BinaryComparator 按字节索引顺序比较指定字节数组,采用 Bytes.compareTo(byte[])
BinaryPrefixComparator 与 BinaryComparator 相同,但只比较左端(前缀部分)的数据是否相同
NullComparator 判断给定的是否为空
BitComparator 按位比较
RegexStringComparator 提供一个正则的比较器,仅支持 EQUAL 和 NOT_EQUAL
SubstringComparator 判断提供的子串是否出现在 value 中
2. 比较过滤器
2.1 行键过滤器
过滤出 rowkey
大于 10004 的数据:
// 过滤器
/**
 * Scans {@code tableName} with a row-key filter and prints every returned cell.
 *
 * <p>Only rows whose key compares greater than {@code "10004"} are kept. The comparison is
 * performed by {@link BinaryComparator}, i.e. byte-wise on the encoded key, not numerically.
 *
 * @param tableName name of the table to scan
 * @throws IOException propagated from the HBase client calls (opening the table, scanning,
 *     or closing the resources)
 */
public static void scanFilterData(String tableName) throws IOException {
    // GREATER + BinaryComparator: keep rows whose key sorts after "10004" byte-wise.
    Filter rowFilter = new RowFilter(CompareFilter.CompareOp.GREATER, new BinaryComparator(Bytes.toBytes("10004")));
    Scan scan = new Scan();
    scan.setFilter(rowFilter);
    // try-with-resources closes the scanner first and then the table, even if the scan throws.
    try (Table table = connection.getTable(TableName.valueOf(tableName));
         ResultScanner resultScanner = table.getScanner(scan)) {
        for (Result result : resultScanner) {
            for (Cell cell : result.rawCells()) {
                System.out.println("行键: " + Bytes.toString(result.getRow()));
                System.out.println("列族: " + Bytes.toString(CellUtil.cloneFamily(cell)));
                System.out.println("列: " + Bytes.toString(CellUtil.cloneQualifier(cell)));
                System.out.println("值: " + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
    }
}
测试:
// t2 表中所有数据
hbase(main):008:0> scan 't2'
ROW COLUMN+CELL
10004 column=info:alias2, timestamp=1628383262854, value=jun2
10011 column=info:alias4, timestamp=1628383262854, value=jun4
10016 column=info:alias5, timestamp=1628383262854, value=jun5
3 row(s) in 0.1140 seconds
// 过滤器
scanFilterData("t2");
行键: 10011
列族: info
列: alias4
值: jun4
行键: 10016
列族: info
列: alias5
值: jun5
2.2 列族过滤器
过滤 info
列族:
// 列族过滤器
/**
 * Scans {@code tableName} and prints every cell that belongs to the column family {@code cf}.
 *
 * <p>The family name is matched for byte-wise equality ({@code EQUAL} +
 * {@link BinaryComparator}); a family that matches nothing simply produces no output.
 *
 * @param tableName name of the table to scan
 * @param cf column family to keep
 * @throws IOException propagated from the HBase client calls (opening the table, scanning,
 *     or closing the resources)
 */
public static void scanFilterCf(String tableName, String cf) throws IOException {
    // Keep only cells whose column family equals cf.
    Filter cfFilter = new FamilyFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(cf)));
    Scan scan = new Scan();
    scan.setFilter(cfFilter);
    // try-with-resources closes the scanner first and then the table, even if the scan throws.
    try (Table table = connection.getTable(TableName.valueOf(tableName));
         ResultScanner resultScanner = table.getScanner(scan)) {
        for (Result result : resultScanner) {
            for (Cell cell : result.rawCells()) {
                System.out.println("行键: " + Bytes.toString(result.getRow()) +
                        " 列族: " + Bytes.toString(CellUtil.cloneFamily(cell)) +
                        " 列: " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
                        " 值: " + Bytes.toString(CellUtil.cloneValue(cell))
                );
            }
        }
    }
}
测试:
scanFilterCf("t2", "info");
行键: 10004 列族: info 列: alias2 值: jun2
行键: 10011 列族: info 列: alias4 值: jun4
行键: 10016 列族: info 列: alias5 值: jun5
// 不存在的列族
scanFilterCf("t2", "info2");
2.3 列过滤器
// Keep only the cells whose column qualifier is byte-wise equal to cn (e.g. "alias2").
Filter qualifierFilter = new QualifierFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(cn)));
Scan scan = new Scan();
scan.setFilter(qualifierFilter);
测试:
scanFilterCn("t2", "alias2");
行键: 10004 列族: info 列: alias2 值: jun2
2.4 值过滤器
// Keep only the cells whose value is byte-wise equal to the given value (e.g. "jun5").
Filter valueFilter = new ValueFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(value)));
Scan scan = new Scan();
scan.setFilter(valueFilter);
测试:
scanFilterValue("t2", "jun5");
行键: 10016 列族: info 列: alias5 值: jun5
2.5 时间戳过滤器
/**
 * Scans {@code tableName} and prints every cell whose timestamp equals {@code Timestamp}.
 *
 * @param tableName name of the table to scan
 * @param Timestamp the single cell timestamp to keep (e.g. {@code 1628383262854L})
 * @throws IOException propagated from the HBase client calls (opening the table, scanning,
 *     or closing the resources)
 */
public static void scanFilterTimestamp(String tableName, long Timestamp) throws IOException {
    // TimestampsFilter takes a list of accepted timestamps; here it holds exactly one.
    List<Long> list = new ArrayList<>();
    list.add(Timestamp);
    TimestampsFilter timestampsFilter = new TimestampsFilter(list);
    Scan scan = new Scan();
    scan.setFilter(timestampsFilter);
    // try-with-resources closes the scanner first and then the table, even if the scan throws.
    try (Table table = connection.getTable(TableName.valueOf(tableName));
         ResultScanner resultScanner = table.getScanner(scan)) {
        for (Result result : resultScanner) {
            for (Cell cell : result.rawCells()) {
                System.out.println("行键: " + Bytes.toString(result.getRow()) +
                        " 列族: " + Bytes.toString(CellUtil.cloneFamily(cell)) +
                        " 列: " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
                        " 值: " + Bytes.toString(CellUtil.cloneValue(cell)) +
                        " 时间戳: " + cell.getTimestamp()
                );
            }
        }
    }
}
测试:
scanFilterTimestamp("t2", 1628383262854L);
行键: 10004 列族: info 列: alias2 值: jun2 时间戳: 1628383262854
行键: 10011 列族: info 列: alias4 值: jun4 时间戳: 1628383262854
行键: 10016 列族: info 列: alias5 值: jun5 时间戳: 1628383262854
3. 专用过滤器
3.1 单列值过滤器
public static void scanFilterSingleValue(String tableName, String cf, String cn, String value) throws IOException {
Table table = connection.getTable(TableName.valueOf(tableName));
SingleColumnValueFilter singleColumnValueFilter = new SingleColumnValueFilter(
Bytes.toBytes(cf),
Bytes.toBytes(cn),,
CompareFilter.CompareOp.EQUAL,
new SubstringComparator(value)
);
//如果不设置为 true,则那些不包含指定 column 的行也会返回
singleColumnValueFilter.setFilterIfMissing(true);
Scan scan = new Scan();
scan.setFilter(singleColumnValueFilter);
ResultScanner resultScanner = table.getScanner(scan);
for (Result result : resultScanner) {
Cell[] cells = result.rawCells();
for (Cell cell : cells) {
System.out.println("行键: " + Bytes.toString(result.getRow()) +
" 列族: " + Bytes.toString(CellUtil.cloneFamily(cell)) +
" 列: " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
" 值: " + Bytes.toString(CellUtil.cloneValue(cell)) +
" 时间戳: " + cell.getTimestamp()
);
}
}
table.close();
}
测试:
scanFilterSingleValue("t2", "info", "alias2", "jun2");
行键: 10004 列族: info 列: alias2 值: jun2 时间戳: 1628383262854
// 注释掉 singleColumnValueFilter.setFilterIfMissing(true); 后,不包含该列的行也会返回:
行键: 10004 列族: info 列: alias2 值: jun2 时间戳: 1628383262854
行键: 10011 列族: info 列: alias4 值: jun4 时间戳: 1628383262854
行键: 10016 列族: info 列: alias5 值: jun5 时间戳: 1628383262854
3.2 单列值排除器
/**
 * Scans {@code tableName} with a {@link SingleColumnValueExcludeFilter} on column
 * {@code cf:cn} and prints every returned cell.
 *
 * <p>A row passes when the tested column's value is byte-wise equal to {@code value}; rows
 * without the column are dropped because {@code setFilterIfMissing(true)} is applied.
 * NOTE(review): per the HBase API documentation this filter also omits the tested column
 * itself from the returned cells, so a row that contains only that column prints nothing.
 *
 * @param tableName name of the table to scan
 * @param cf column family of the column to test
 * @param cn qualifier of the column to test
 * @param value exact value the tested column must have
 * @throws IOException propagated from the HBase client calls (opening the table, scanning,
 *     or closing the resources)
 */
public static void scanFilterSingleExcludeValue(String tableName, String cf, String cn, String value) throws IOException {
    // Exact byte match on the value. For substring matching, pass
    // new SubstringComparator(value) as the last argument instead.
    SingleColumnValueExcludeFilter singleColumnValueExcludeFilter = new SingleColumnValueExcludeFilter(
            Bytes.toBytes(cf),
            Bytes.toBytes(cn),
            CompareFilter.CompareOp.EQUAL,
            Bytes.toBytes(value)
    );
    // Without this, rows that do not contain the tested column are returned as well.
    singleColumnValueExcludeFilter.setFilterIfMissing(true);
    Scan scan = new Scan();
    scan.setFilter(singleColumnValueExcludeFilter);
    // try-with-resources closes the scanner first and then the table, even if the scan throws.
    try (Table table = connection.getTable(TableName.valueOf(tableName));
         ResultScanner resultScanner = table.getScanner(scan)) {
        for (Result result : resultScanner) {
            for (Cell cell : result.rawCells()) {
                System.out.println("行键: " + Bytes.toString(result.getRow()) +
                        " 列族: " + Bytes.toString(CellUtil.cloneFamily(cell)) +
                        " 列: " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
                        " 值: " + Bytes.toString(CellUtil.cloneValue(cell)) +
                        " 时间戳: " + cell.getTimestamp()
                );
            }
        }
    }
}
测试:
scanFilterSingleExcludeValue("t2", "info", "alias2", "jun2");
3.3 前缀过滤器(针对行键)
/**
 * Scans {@code tableName} and prints the cells of every row whose row key starts with
 * {@code rowKeyPrefix}.
 *
 * @param tableName name of the table to scan
 * @param rowKeyPrefix prefix the row key must start with (e.g. {@code "1001"})
 * @throws IOException propagated from the HBase client calls (opening the table, scanning,
 *     or closing the resources)
 */
public static void scanFilterPrefix(String tableName, String rowKeyPrefix) throws IOException {
    PrefixFilter prefixFilter = new PrefixFilter(Bytes.toBytes(rowKeyPrefix));
    Scan scan = new Scan();
    scan.setFilter(prefixFilter);
    // try-with-resources closes the scanner first and then the table, even if the scan throws.
    try (Table table = connection.getTable(TableName.valueOf(tableName));
         ResultScanner resultScanner = table.getScanner(scan)) {
        for (Result result : resultScanner) {
            for (Cell cell : result.rawCells()) {
                System.out.println("行键: " + Bytes.toString(result.getRow()) +
                        " 列族: " + Bytes.toString(CellUtil.cloneFamily(cell)) +
                        " 列: " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
                        " 值: " + Bytes.toString(CellUtil.cloneValue(cell)) +
                        " 时间戳: " + cell.getTimestamp()
                );
            }
        }
    }
}
测试:
scanFilterPrefix("t2", "1001");
行键: 10011 列族: info 列: alias4 值: jun4 时间戳: 1628383262854
行键: 10016 列族: info 列: alias5 值: jun5 时间戳: 1628383262854
3.4 列前缀过滤器
/**
 * Scans {@code tableName} and prints every cell whose column qualifier starts with
 * {@code CnPrefix}.
 *
 * @param tableName name of the table to scan
 * @param CnPrefix prefix the column qualifier must start with (e.g. {@code "ali"})
 * @throws IOException propagated from the HBase client calls (opening the table, scanning,
 *     or closing the resources)
 */
public static void scanFilterColumnPrefix(String tableName, String CnPrefix) throws IOException {
    ColumnPrefixFilter columnPrefixFilter = new ColumnPrefixFilter(Bytes.toBytes(CnPrefix));
    Scan scan = new Scan();
    scan.setFilter(columnPrefixFilter);
    // try-with-resources closes the scanner first and then the table, even if the scan throws.
    try (Table table = connection.getTable(TableName.valueOf(tableName));
         ResultScanner resultScanner = table.getScanner(scan)) {
        for (Result result : resultScanner) {
            for (Cell cell : result.rawCells()) {
                System.out.println("行键: " + Bytes.toString(result.getRow()) +
                        " 列族: " + Bytes.toString(CellUtil.cloneFamily(cell)) +
                        " 列: " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
                        " 值: " + Bytes.toString(CellUtil.cloneValue(cell)) +
                        " 时间戳: " + cell.getTimestamp()
                );
            }
        }
    }
}
测试:
scanFilterColumnPrefix("t2", "ali");
行键: 10004 列族: info 列: alias2 值: jun2 时间戳: 1628383262854
行键: 10011 列族: info 列: alias4 值: jun4 时间戳: 1628383262854
行键: 10016 列族: info 列: alias5 值: jun5 时间戳: 1628383262854
3.5 分页过滤器
/**
 * Walks through all rows of {@code tableName} page by page and prints each row key.
 *
 * <p>{@link PageFilter} only limits how many rows one scan returns; it does not remember a
 * position. Paging is therefore done by hand: after each page the last row key seen is
 * turned into the start row of the next scan, until a scan returns no rows.
 *
 * @param tableName name of the table to scan
 * @param pageNum value passed to {@link PageFilter} as the page size (rows per scan)
 * @throws IOException propagated from the HBase client calls (opening the table, scanning,
 *     or closing the resources)
 */
public static void scanFilterPage(String tableName, int pageNum) throws IOException {
    PageFilter pageFilter = new PageFilter(pageNum);
    Scan scan = new Scan();
    scan.setFilter(pageFilter);
    try (Table table = connection.getTable(TableName.valueOf(tableName))) {
        while (true) {
            byte[] lastRowKey;
            // One scanner per page; close it as soon as the page has been consumed.
            try (ResultScanner resultScanner = table.getScanner(scan)) {
                lastRowKey = getLastRowKey(resultScanner);
            }
            if (lastRowKey == null) {
                // An empty page means there is nothing left to read.
                break;
            }
            // Append one zero byte (new byte[1] is zero-filled) so the next scan starts
            // just after the last row of this page instead of returning it again.
            byte[] startRowKey = Bytes.add(lastRowKey, new byte[1]);
            scan.setStartRow(startRowKey);
        }
    }
}
// 获取最后记录的 rowKey
/**
 * Drains {@code rs}, printing the key of every row, and reports the key of the final row.
 *
 * @param rs scanner to iterate to the end
 * @return the row key of the last result, or {@code null} when the scanner yielded no rows
 */
private static byte[] getLastRowKey(ResultScanner rs) {
    byte[] latest = null;
    for (Result row : rs) {
        // Remember each key as we go; whatever is held when the loop ends is the last one.
        latest = row.getRow();
        System.out.println("rowkey: " + Bytes.toString(latest));
    }
    return latest;
}
测试:
// 设置分页数量为 2,总共有 3 行
scanFilterPage("t2", 2);
rowkey: 10004
rowkey: 10011
rowkey: 10016
注意:
PageFilter
本身不能实现翻页:如果想翻页,就得记录上一页最后一行的 rowkey,并将其作为下一次 Scan 的起始行。