zoukankan      html  css  js  c++  java
  • hive中使用rcfile

      (1)建 student、studentrc、studentlzo 表:(hive 托管)
    -- Managed (Hive-controlled) table in plain text format:
    -- partitioned by load date, 4 hash buckets on id, rows in each bucket sorted by age.
    CREATE TABLE student (id INT, age INT, name STRING)
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

    -- Same schema and bucketing as student, but stored in the columnar RCFile format.
    CREATE TABLE studentrc (id INT, age INT, name STRING)
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS RCFILE;

    -- RCFile-backed twin of studentrc; LZO compression is applied at write time via
    -- session codec settings (see the lzo import below), not in the DDL itself.
    CREATE TABLE studentlzo (id INT, age INT, name STRING)
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS RCFILE;

    文件格式 textfile, sequencefile, rcfile
    (2)设置环境变量:
    -- Make INSERT honor the 4-bucket CLUSTERED BY layout declared in the DDL above.
    set hive.enforce.bucketing = true;
    (3)插入数据:
      -- Load from the client-local filesystem; OVERWRITE replaces any existing data
      -- in the 20120802 partition of the text-format student table.
      LOAD DATA local INPATH '/home/hadoop/hivetest1.txt' OVERWRITE INTO TABLE student partition(stat_date="20120802");


    (CPU使用率很高)
    -- Copy the 20120802 partition from student into student1, sorting each
    -- reducer's output by age.
    -- NOTE(review): student1 is not created anywhere in this file -- the DDL above
    -- creates student/studentrc/studentlzo. Confirm the student1 DDL exists
    -- (the intro text suggests it should mirror student).
    from student
    insert overwrite table student1 partition(stat_date="20120802")
    select id,age,name where stat_date="20120802" sort by age;

    查看数据
    -- DISTRIBUTE BY id routes all rows sharing an id to the same reducer
    -- (analogous to the map output key in MapReduce).
    -- Fix: the original trailing "// ..." text is not a valid HiveQL comment;
    -- after the ';' the Hive CLI would parse it as a new, broken command.
    select id, age, name from student distribute by id;


    抽选数据(一般测试的情况下使用)
    -- Bucket sampling for quick tests: with 4 buckets and OUT OF 2, Hive reads
    -- 4/2 = 2 buckets (bucket 1 and bucket 1+2 = 3), hashed on id.
    select * from student tablesample(bucket 1 out of 2 on id);
    TABLESAMPLE(BUCKET x OUT OF y)
    其中, x必须比y小, y必须是在创建表的时候bucket on的数量的因子或者倍数, hive会根据y的大小来决定抽样多少, 比如原本分了32份, 当y=16时, 抽取32/16=2份, 这时TABLESAMPLE(BUCKET 3 OUT OF 16) 就意味着要抽取第3和第16+3=19份的样本. 如果y=64, 则要抽取 32/64=1/2份数据, 这时TABLESAMPLE(BUCKET 3 OUT OF 64) 意味着抽取第3份数据的一半来进行.

    rcfile操作

    -- Import into the RCFile table with gzip-compressed output.
    -- Fix: the original "// ..." header is not valid HiveQL comment syntax; use "--".
    set hive.enforce.bucketing=true;
    set hive.exec.compress.output=true;
    set mapred.output.compress=true;
    set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
    -- NOTE(review): this REPLACES the session codec list instead of appending to it;
    -- other compressed inputs may become unreadable afterwards -- confirm intended.
    set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;
    from student
    insert overwrite table studentrc partition(stat_date="20120802")
    select id,age,name where stat_date="20120802" sort by age;


    -- Import into the RCFile table with LZO compression.
    -- Fix: the original trailing "// ..." annotations are not valid HiveQL comments;
    -- after each ';' the Hive CLI would try to execute them as commands.
    -- RCFile record buffer: 16 MiB (16 * 1024 * 1024).
    set hive.io.rcfile.record.buffer.size = 16777216;
    -- I/O buffer size: 128 KiB (128 * 1024).
    set io.file.buffer.size = 131072;

    set hive.enforce.bucketing=true;
    set hive.exec.compress.output=true;
    set mapred.output.compress=true;
    set mapred.output.compression.codec=com.hadoop.compression.lzo.LzoCodec;
    -- NOTE(review): replaces the session codec list with LzoCodec only -- confirm intended.
    set io.compression.codecs=com.hadoop.compression.lzo.LzoCodec;
    from student
    insert overwrite table studentlzo partition(stat_date="20120802")
    select id,age,name where stat_date="20120802" sort by age;

    -- Import into a SequenceFile table with gzip output compression.
    -- Fix: the original "// ..." header is not valid HiveQL comment syntax; use "--".
    -- NOTE(review): studentseq is not created in this file; presumably it is declared
    -- STORED AS SEQUENCEFILE elsewhere -- confirm the DDL exists before running.
    set hive.exec.compress.output=true;
    set mapred.output.compress=true;
    set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
    set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;
    insert overwrite table studentseq select * from student;

  • 相关阅读:
    【面积并】 Atlantis
    【动态前k大 贪心】 Gone Fishing
    【复杂枚举】 library
    【双端队列bfs 网格图建图】拯救大兵瑞恩
    【奇偶传递关系 边带权】 奇偶游戏
    【权值并查集】 supermarket
    CF w4d3 A. Pythagorean Theorem II
    CF w4d2 C. Purification
    CF w4d2 B. Road Construction
    CF w4d2 A. Cakeminator
  • 原文地址:https://www.cnblogs.com/chengxin1982/p/3981954.html
Copyright © 2011-2022 走看看