1. Create an HBase table
-- Create the table
hbase(main):003:0> create 'test','cf'

-- Check the HDFS directory; at this point the cf directory holds no data
[root@node1 ~]# hadoop fs -ls /hbase/data/default/test
Found 3 items
drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tabledesc
drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tmp
drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508
[root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf
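The same table can also be created programmatically. Below is a minimal happybase sketch of the equivalent create; 'node1' is a placeholder host here, and it assumes the HBase Thrift service is running (happybase is the same library used by the Python script in section 6):

import happybase

# Placeholder host; requires the HBase Thrift server to be running there.
conn = happybase.Connection('node1')

# Equivalent of the shell command: create 'test','cf'
# One column family named 'cf' with default settings.
conn.create_table('test', {'cf': dict()})
print(conn.tables())  # b'test' should now appear in the list
conn.close()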
2. Insert data
-- Insert one row
hbase(main):005:0> put 'test','user1','cf:name','zhang'

-- Check the cf directory again: still no data, because the write so far
-- lives only in the region's MemStore (and the WAL)
[root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf
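For comparison, the same put through happybase ('node1' again stands in for your Thrift host). The point of this step is that the write lands in the WAL and the MemStore, which is why nothing appears under cf/ on HDFS yet:

import happybase

conn = happybase.Connection('node1')  # placeholder Thrift host
table = conn.table('test')

# Equivalent of: put 'test','user1','cf:name','zhang'
# The write goes to the WAL and the region's MemStore; no file appears
# under .../cf on HDFS until the MemStore is flushed.
table.put(b'user1', {b'cf:name': b'zhang'})
conn.close()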
3. flush
-- Flush the table
hbase(main):006:0> flush 'test'

-- Now the cf directory contains data: this file is a StoreFile
[root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf
-rw-r--r--   3 root supergroup       4916 2020-01-17 11:28 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/6a15799391e84e2689969b3eb461330d
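Each flush writes the MemStore out as one immutable StoreFile (HFile) per column family. A small sketch for watching that directory from Python, shelling out to the same hdfs command used above (the region path is the one from this walkthrough):

import subprocess

CF_DIR = "/hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf"

def list_store_files(path=CF_DIR):
    """List the StoreFile paths under a region's column-family directory."""
    out = subprocess.run(["hdfs", "dfs", "-ls", "-R", path],
                         capture_output=True, text=True, check=True).stdout
    # File entries start with '-' (directories start with 'd');
    # the last whitespace-separated field is the HDFS path.
    return [line.split()[-1] for line in out.splitlines() if line.startswith("-")]

print(list_store_files())  # one entry per flush since the last compaction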
Now insert some more data and flush again:
-- Write two more values for the same rowkey and flush again
hbase(main):007:0> put 'test','user1','cf:name','zhang1'
hbase(main):010:0> put 'test','user1','cf:name','zhang123'
hbase(main):008:0> flush 'test'

-- Check: there are now three StoreFiles
[root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf
-rw-r--r--   3 root supergroup       4916 2020-01-17 11:28 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/6a15799391e84e2689969b3eb461330d
-rw-r--r--   3 root supergroup       4917 2020-01-17 13:45 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/9cd91088cea440beafec10548e5a0bae
-rw-r--r--   3 root supergroup       4919 2020-01-17 13:47 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/ff5c2ca27140490e8f9d10c0e3eae491

-- Scan the table
hbase(main):012:0> scan 'test'
ROW                  COLUMN+CELL
 user1               column=cf:name, timestamp=1579240055064, value=zhang123

Notice that although the cf directory now holds three files, the scan returns only a single row. This is because HBase writes sequentially: each flush appends a new immutable file rather than updating existing data in place. Sequential writes are very efficient; reads pay a modest price, since several StoreFiles may need to be consulted to find the latest version (though it is not dramatically slower).
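A quick way to see the versioning side of this from Python is happybase's cells(), which returns the stored versions of one cell, newest first. Note that 'cf' above was created with the default VERSIONS => 1, so this read returns only zhang123 even while the older copies still sit in the un-compacted StoreFiles; had the family been created with VERSIONS => 3, all three values would come back ('node1' is again a placeholder host):

import happybase

conn = happybase.Connection('node1')  # placeholder Thrift host
table = conn.table('test')

# Ask for up to 3 versions of the cell written above, newest first.
# With the default VERSIONS => 1 on 'cf', only the latest is returned.
for value, ts in table.cells(b'user1', b'cf:name', versions=3,
                             include_timestamp=True):
    print(value, ts)  # e.g. b'zhang123' 1579240055064
conn.close()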
4. compact
-- Run a compaction on the test table
hbase(main):013:0> compact 'test'

-- The files in the cf directory have been merged back into a single one
[root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf
-rw-r--r--   3 root supergroup       4919 2020-01-17 13:57 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/00a68679a32e4d359f8daeb0be4321c2
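Compactions can also be triggered from code. A minimal sketch using the same pattern as the shell script in section 6 (which pipes a split command into a non-interactive hbase shell):

import subprocess

def hbase_shell(command: str) -> str:
    """Feed a single command to `hbase shell` on stdin and return its output."""
    proc = subprocess.run(["hbase", "shell"], input=command + "\n",
                          capture_output=True, text=True)
    return proc.stdout

# Requests a minor compaction of all regions of 'test'; use
# "major_compact 'test'" to force a full rewrite that also
# purges deleted and expired cells.
print(hbase_shell("compact 'test'"))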
5. split
-- Check HDFS: 59c4ce9f21361f871b64ce86a9a66508 is the directory of a single region
[root@node1 ~]# hdfs dfs -ls /hbase/data/default/test
Found 3 items
drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tabledesc
drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tmp
drwxr-xr-x   - root supergroup          0 2020-01-17 11:28 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508

-- Insert some more rows
hbase(main):018:0> put 'test','user2','cf:name','zhang1'
hbase(main):019:0> put 'test','user3','cf:name','zhang1'
hbase(main):020:0> put 'test','user4','cf:name','zhang1'
hbase(main):021:0> put 'test','user5','cf:name','zhang1'
hbase(main):022:0> put 'test','user6','cf:name','zhang1'

-- Split the region at rowkey 'user4'; regions are left-closed, right-open
-- intervals, so 'user4' becomes the first row of the second daughter region
hbase(main):024:0> split 'test','user4'

-- Check HDFS again: the original region has been split into two; the old
-- region directory is deleted automatically after a while
[root@node1 ~]# hdfs dfs -ls /hbase/data/default/test
Found 5 items
drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tabledesc
drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tmp
drwxr-xr-x   - root supergroup          0 2020-01-17 14:10 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508
drwxr-xr-x   - root supergroup          0 2020-01-17 14:10 /hbase/data/default/test/c4d08147e30b00058ced15b8547260a4
drwxr-xr-x   - root supergroup          0 2020-01-17 14:10 /hbase/data/default/test/c6391f28bc8acdcb12bf185dc45596b9
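The "left-closed, right-open" rule means a region covers [start_key, end_key), so the split key becomes the start key of the second daughter. A tiny pure-Python illustration of which daughter a rowkey lands in (HBase compares rowkeys lexicographically as bytes, which matches Python string comparison for ASCII keys):

import bisect

def daughter_index(rowkey, split_keys):
    """Regions are left-closed, right-open: [start, next_split_key)."""
    return bisect.bisect_right(split_keys, rowkey)

split_keys = ["user4"]  # the split point used above
for row in ["user1", "user3", "user4", "user6"]:
    print(row, "-> daughter region", daughter_index(row, split_keys))
# user1 -> 0, user3 -> 0, user4 -> 1, user6 -> 1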
Compactions and splits can also be triggered from the HBase web UI.
6. Scripts for manual HBase compact and split
https://yq.aliyun.com/articles/59591
Shell script:
#!/bin/bash

die () {
    echo >&2 "$@"
    echo "usage:"
    echo "  $0 check|split table_name [split_size]"
    exit 1
}

[[ "$#" -lt 2 ]] && die "at least 2 arguments required, $# provided"

COMMAND=$1
TABLE=$2
SIZE="${3:-1073741824}"

split() {
    # Look up the region's row key in hbase:meta, then feed a split
    # command to a non-interactive hbase shell.
    region_key=`python /home/hduser/hbase/hbase-scan.py -t hbase:meta -f "RowFilter (=, 'substring:$1')"`
    echo "split '$region_key'" | hbase shell
}

if [ "$COMMAND" != "check" ] ; then
    for region in `hadoop fs -ls /hbase/data/default/$TABLE | awk '{print $8}'`
    do
        # Skip the hidden entries (.tabledesc, .tmp); keep only region dirs.
        [[ ${region##*/} =~ ^\. ]] && continue
        [[ `hadoop fs -du -s $region | awk '{print $1}'` -gt $SIZE ]] && split ${region##*/}
    done
    # check after split
    sleep 60
fi

for region in `hadoop fs -ls /hbase/data/default/$TABLE | awk '{print $8}'`
do
    [[ ${region##*/} =~ ^\. ]] && continue
    if [[ `hadoop fs -du -s $region | awk '{print $1}'` -gt $SIZE ]] ; then
        echo "${region##*/} (`hadoop fs -du -s -h $region | awk '{print $1 $2}'`) is a huge region"
    else
        echo "${region##*/} (`hadoop fs -du -s -h $region | awk '{print $1 $2}'`) is a small region"
    fi
done
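Assuming the script is saved as, say, hbase-region.sh (a name chosen here for illustration): "./hbase-region.sh check eventlog" only reports which regions exceed the size threshold, while "./hbase-region.sh split eventlog" splits every region of the eventlog table larger than the default 1073741824 bytes (1 GB) and re-checks the sizes one minute later.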
Python script:
import argparse
import logging

import happybase


def connect_to_hbase():
    # Connects to the HBase Thrift server on this host.
    return happybase.Connection('itr-hbasetest01')


def main():
    logging.basicConfig(
        format='%(asctime)s %(name)s %(levelname)s: %(message)s',
        level=logging.INFO)

    argp = argparse.ArgumentParser(description='EventLog Reader')
    argp.add_argument('-t', '--table', dest='table', default='eventlog')
    argp.add_argument('-p', '--prefix', dest='prefix')
    argp.add_argument('-f', '--filter', dest='filter')
    argp.add_argument('-l', '--limit', dest='limit', default=10)
    args = argp.parse_args()

    hbase_conn = connect_to_hbase()
    table = hbase_conn.table(args.table)

    logging.info("scan start")
    # Row keys are bytes; encode the prefix if one was given.
    prefix = args.prefix.encode() if args.prefix else None
    scanner = table.scan(row_prefix=prefix, batch_size=1000,
                         limit=int(args.limit), filter=args.filter)
    logging.info("scan done")

    i = 0
    for key, data in scanner:
        logging.info(key)
        print(key)
        i += 1
    logging.info('%s rows read in total', i)


if __name__ == '__main__':
    main()
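This scanner is presumably the hbase-scan.py helper invoked by the shell script above: run against hbase:meta with a substring RowFilter, it prints the row key of the matching region, which the shell script then feeds to the split command, e.g.:

python hbase-scan.py -t hbase:meta -f "RowFilter (=, 'substring:test')"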