zoukankan      html  css  js  c++  java
  • hbase禁用掉自动split后如何维护

    1、创建hbase表

    --建表
    hbase(main):003:0> create 'test','cf'
    
    --查看hdfs目录,此时cf目录下没有数据
    [root@node1 ~]# hadoop fs -ls /hbase/data/default/test
    Found 3 items
    drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tabledesc
    drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tmp
    drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508
    
    [root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf


    2、插入数据

    --插数据
    hbase(main):005:0> put 'test','user1','cf:name','zhang'
    
    --再看cf目录,依然没有数据
    [root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf


    3、flush

    --flush
    hbase(main):006:0> flush 'test'
    
    --此时cf目录下就有数据了,这就是一个storefile
    [root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf
    -rw-r--r--   3 root supergroup       4916 2020-01-17 11:28 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/6a15799391e84e2689969b3eb461330d


    此时再插入一些数据,并flush

    --再插入两条数据
    hbase(main):007:0> put 'test','user1','cf:name','zhang1'
    hbase(main):010:0> put 'test','user1','cf:name','zhang123'
    
    hbase(main):008:0> flush 'test'
    
    --查看,已经有三个文件了(上面两次put之后各执行过一次flush,每次flush都会生成一个新的storefile)
    [root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf
    -rw-r--r--   3 root supergroup       4916 2020-01-17 11:28 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/6a15799391e84e2689969b3eb461330d
    -rw-r--r--   3 root supergroup       4917 2020-01-17 13:45 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/9cd91088cea440beafec10548e5a0bae
    -rw-r--r--   3 root supergroup       4919 2020-01-17 13:47 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/ff5c2ca27140490e8f9d10c0e3eae491
    
    --scan,
    hbase(main):012:0> scan 'test'
    ROW                                          COLUMN+CELL                                                                                                                    
     user1                                       column=cf:name, timestamp=1579240055064, value=zhang123 
    
    
    --可以发现,虽然cf目录中已经有三个文件,但scan只返回一条数据(同一rowkey、同一列只显示最新版本)。这说明hbase的更新是以追加的方式顺序写入新的storefile,而不是原地修改旧文件,这样写入的效率比较高,
    --但是读取时需要合并多个storefile中的数据,读取的效率就会稍微差一些(也不是特别差)


    4、compact

    --对test表执行compact
    hbase(main):013:0> compact 'test'
    
    
    --此时cf目录中的文件就又合并成一个了
    [root@node1 ~]# hdfs dfs -ls -R /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf
    -rw-r--r--   3 root supergroup       4919 2020-01-17 13:57 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508/cf/00a68679a32e4d359f8daeb0be4321c2


    5、split

    --查看hdfs,59c4ce9f21361f871b64ce86a9a66508就是一个region对应的目录
    [root@node1 ~]# hdfs dfs -ls /hbase/data/default/test
    Found 3 items
    drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tabledesc
    drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tmp
    drwxr-xr-x   - root supergroup          0 2020-01-17 11:28 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508
    
    
    --插入一些数据
    hbase(main):018:0> put 'test','user2','cf:name','zhang1'
    
    hbase(main):019:0> put 'test','user3','cf:name','zhang1'
    
    hbase(main):020:0> put 'test','user4','cf:name','zhang1'
    
    hbase(main):021:0> put 'test','user5','cf:name','zhang1'
    
    hbase(main):022:0> put 'test','user6','cf:name','zhang1'
    
    
    --split(分割region),以user4(rowkey)为分割点:rowkey小于user4的数据留在前一个region,大于等于user4的数据划入后一个region
    hbase(main):024:0> split 'test','user4'
    
    
    --再查看hdfs,原先的region已经被分割为两个新的region目录,老的region目录过一会儿就会被自动删除
    [root@node1 ~]# hdfs dfs -ls /hbase/data/default/test
    Found 5 items
    drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tabledesc
    drwxr-xr-x   - root supergroup          0 2020-01-17 11:23 /hbase/data/default/test/.tmp
    drwxr-xr-x   - root supergroup          0 2020-01-17 14:10 /hbase/data/default/test/59c4ce9f21361f871b64ce86a9a66508
    drwxr-xr-x   - root supergroup          0 2020-01-17 14:10 /hbase/data/default/test/c4d08147e30b00058ced15b8547260a4
    drwxr-xr-x   - root supergroup          0 2020-01-17 14:10 /hbase/data/default/test/c6391f28bc8acdcb12bf185dc45596b9


    compact和split也可以在hbase的web页面中做;


    6、hbase手动compact与split脚本

    https://yq.aliyun.com/articles/59591

    shell脚本

    #!/bin/bash
    
    # Report a fatal error, show the usage text and abort the script.
    # Arguments: $@ - words of the error message (joined with spaces).
    # Outputs:   error message and usage text on stderr; nothing on stdout.
    # Exits:     always, with status 1.
    die () {
        # printf rather than echo so a message beginning with -n/-e is printed verbatim.
        printf '%s\n' "$*" >&2
        # Usage is part of the diagnostic, so it goes to stderr as well and
        # cannot pollute stdout when the script's output is being captured.
        printf '%s\n' "usage:" "       $0 check|split table_name [split_size]" >&2
        exit 1
    }
    
    # A command and a table name are mandatory; the split size is optional.
    if (( $# < 2 )); then
        die "at least 2 arguments required, $# provided"
    fi

    # COMMAND: "check" only reports region sizes; any other value also splits.
    # TABLE:   table whose regions live under /hbase/data/default/<TABLE>.
    # SIZE:    size threshold in bytes; defaults to 1073741824 (1024^3).
    COMMAND="$1"
    TABLE="$2"
    SIZE="${3:-1073741824}"
    
    # Ask HBase to split one region.
    # Looks up the region's row key in hbase:meta (via hbase-scan.py, filtering on
    # the encoded region name as a substring) and pipes a `split '<key>'` command
    # into `hbase shell`.
    # Arguments: $1 - encoded region name (the region's directory name in HDFS).
    # Outputs:   whatever `hbase shell` prints; an error message on stderr when
    #            the lookup fails.
    # Returns:   1 when the lookup fails or finds nothing, otherwise the status
    #            of `hbase shell`.
    split() {
        local encoded_name=$1
        local region_key

        # Declaration and assignment are separate so the lookup's exit status is not masked.
        if ! region_key=$(python /home/hduser/hbase/hbase-scan.py -t hbase:meta -f "RowFilter (=, 'substring:${encoded_name}')") \
            || [[ -z "$region_key" ]]; then
            # Without this guard an empty lookup would send `split ''` to hbase shell.
            printf 'split: no hbase:meta row found for region %s\n' "$encoded_name" >&2
            return 1
        fi

        echo "split '$region_key'" | hbase shell
    }
    
    # Split pass: for every command other than "check", split each region of
    # $TABLE whose HDFS size exceeds $SIZE bytes, then pause before the report.
    if [ "$COMMAND" != "check" ] ; then
        # Word splitting of the listing is intentional: one HDFS path per word.
        # The "Found N items" header has no 8th field and therefore vanishes.
        for region in $(hadoop fs -ls "/hbase/data/default/$TABLE" | awk '{print $8}')
        do
            region_name=${region##*/}
            # Skip dot-directories such as .tabledesc and .tmp. The previous test
            # `=~ ^.` matched ANY non-empty name, so every region was skipped and
            # nothing was ever split.
            [[ $region_name == .* ]] && continue
            region_bytes=$(hadoop fs -du -s "$region" | awk '{print $1}')
            if [[ $region_bytes -gt $SIZE ]]; then
                split "$region_name"
            fi
        done

        # check after split
        sleep 60
    fi
    
    # Report pass: print every region of $TABLE with its human-readable size and
    # whether it is above ("huge") or not above ("small") the $SIZE threshold.
    # Word splitting of the listing is intentional: one HDFS path per word.
    for region in $(hadoop fs -ls "/hbase/data/default/$TABLE" | awk '{print $8}')
    do
        region_name=${region##*/}
        # Skip dot-directories such as .tabledesc and .tmp. The previous test
        # `=~ ^.` matched ANY non-empty name, so no region was ever reported.
        [[ $region_name == .* ]] && continue
        # Query each figure once instead of re-running `hadoop fs -du` per branch.
        region_bytes=$(hadoop fs -du -s "$region" | awk '{print $1}')
        region_human=$(hadoop fs -du -s -h "$region" | awk '{print $1 $2}')
        # Explicit if/else: `a && b || c` would also run c whenever b failed.
        if [[ $region_bytes -gt $SIZE ]]; then
            echo "${region_name} (${region_human}) is a huge region"
        else
            echo "${region_name} (${region_human}) is a small region"
        fi
    done


    python脚本

    import subprocess
    import datetime
    import argparse
    import csv
    import gzip
    import happybase
    import logging
    
    def connect_to_hbase(host='itr-hbasetest01'):
        """Open a happybase connection to an HBase Thrift server.

        Args:
            host: Host name to connect to. Defaults to 'itr-hbasetest01', the
                server this script was originally hard-wired to, so existing
                zero-argument callers behave as before.

        Returns:
            The ``happybase.Connection`` created for ``host``.
        """
        return happybase.Connection(host)
    
    def main():
        """Scan an HBase table and print each matching row key on its own line.

        Command-line options:
            -t/--table   table to scan (default 'eventlog')
            -p/--prefix  row-key prefix passed to the scan
            -f/--filter  filter string passed to the scan
            -l/--limit   maximum number of rows to read (default 10)

        Row keys are written to stdout, one per line; progress messages go
        through ``logging``. The connection is closed when the scan finishes or
        fails.
        """
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s',level=logging.INFO)

        argp = argparse.ArgumentParser(description='EventLog Reader')
        argp.add_argument('-t','--table', dest='table', default='eventlog')
        argp.add_argument('-p','--prefix', dest='prefix')
        argp.add_argument('-f','--filter', dest='filter')
        argp.add_argument('-l','--limit', dest='limit', default=10)

        args = argp.parse_args()

        hbase_conn = connect_to_hbase()
        try:
            table = hbase_conn.table(args.table)
            logging.info("scan start")
            scanner = table.scan(row_prefix=args.prefix, batch_size=1000, limit=int(args.limit), filter=args.filter)
            # NOTE(review): scan() presumably returns a lazy iterator, in which case
            # this message is logged before any row has been fetched -- confirm.
            logging.info("scan done")
            i = 0
            for key, data in scanner:
                logging.info(key)
                # `print key` is Python 2-only syntax. This form runs on both
                # versions: a native str is printed unchanged (identical output on
                # Python 2), while a bytes key on Python 3 is decoded first so the
                # caller does not receive a "b'...'" representation.
                print(key if isinstance(key, str) else key.decode('utf-8', 'replace'))
                i+=1

            logging.info('%s rows read in total', i)
        finally:
            # Release the connection even when the scan raises.
            hbase_conn.close()

    if __name__ == '__main__':
        main()
  • 相关阅读:
    对Java课程的感想
    OO第二阶段总结
    OO第一作业周期(前四周)总结
    实验7 流类库和输入输出
    实验6 类的继承和多态
    实验5 类和对象3
    实验4 类与对象2
    实验3 类和对象
    实验2
    实验1
  • 原文地址:https://www.cnblogs.com/weiyiming007/p/12205719.html
Copyright © 2011-2022 走看看