zoukankan      html  css  js  c++  java
  • hive中使用rcfile

      (1)建 student、studentrc、studentlzo 表:(hive 托管)
    -- Managed (Hive-controlled) staging table in the default TEXTFILE format,
    -- partitioned by load date and bucketed on id for sampling and bucketed joins.
    CREATE TABLE student (id INT, age INT, name STRING)
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

    -- Same schema/bucketing as student, but stored as RCFile
    -- (columnar row-group layout; compression is chosen at write time).
    CREATE TABLE studentrc (id INT, age INT, name STRING)
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS RCFILE;

    -- RCFile table intended for LZO-compressed output; the codec itself is
    -- selected by the session settings used at insert time, not by the DDL.
    CREATE TABLE studentlzo (id INT, age INT, name STRING)
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS RCFILE;

    文件格式 textfile, sequencefile, rcfile
    (2)设置环境变量:
    -- Without this flag Hive does not force INSERT output into the 4 buckets
    -- declared by CLUSTERED BY; it must be set before inserting into bucketed tables.
    set hive.enforce.bucketing = true;
    (3)插入数据:
      LOAD DATA local INPATH '/home/hadoop/hivetest1.txt' OVERWRITE INTO TABLE student partition(stat_date="20120802");


    (CPU使用率很高)
    -- Rewrite the 20120802 slice into student1, sorted by age within each reducer.
    -- NOTE(review): student1 is not created anywhere in this snippet; it must
    -- already exist with a schema matching (id, age, name) — confirm.
    FROM student
    INSERT OVERWRITE TABLE student1 PARTITION (stat_date="20120802")
    SELECT id, age, name
    WHERE stat_date = "20120802"
    SORT BY age;

    查看数据
    -- DISTRIBUTE BY routes rows to reducers by id, analogous to the MapReduce
    -- partition key. (Fixed: HiveQL comments use --, not //; a trailing //
    -- comment makes the statement fail when run from a script.)
    SELECT id, age, name FROM student DISTRIBUTE BY id;


    抽选数据(一般测试的情况下使用)
    -- Sample bucket 1 out of every 2, hashing rows on id (testing/inspection only).
    SELECT * FROM student TABLESAMPLE (BUCKET 1 OUT OF 2 ON id);
    TABLESAMPLE(BUCKET x OUT OF y)
    其中, x必须小于等于y, y必须是在创建表的时候 bucket 的数量的因子或者倍数, hive会根据y的大小来决定抽样多少, 比如原本分了32份, 当y=16时, 抽取32/16=2份, 这时TABLESAMPLE(BUCKET 3 OUT OF 16) 就意味着要抽取第3和第16+3=19份的样品. 如果y=64, 则要抽取 32/64=1/2份数据, 这时TABLESAMPLE(BUCKET 3 OUT OF 64) 意味着抽取第3份数据的一半来进行.

    rcfile操作

    -- Import into the RCFile table with gzip-compressed output.
    -- (Fixed: HiveQL comments use --, not //.)
    -- NOTE(review): mapred.output.compress / mapred.output.compression.codec are
    -- the old Hadoop-1 property names; on Hadoop 2+ the canonical names are
    -- mapreduce.output.fileoutputformat.compress(.codec), with the old names kept
    -- working through the deprecation mapping.
    set hive.enforce.bucketing=true;
    set hive.exec.compress.output=true;
    set mapred.output.compress=true;
    set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
    set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;
    FROM student
    INSERT OVERWRITE TABLE studentrc PARTITION (stat_date="20120802")
    SELECT id, age, name
    WHERE stat_date = "20120802"
    SORT BY age;


    -- Import into the RCFile table with LZO compression.
    -- (Fixed: HiveQL comments use --, not //; the inline // comments after the
    -- set statements would fail when the file is run as a script.)
    -- NOTE(review): com.hadoop.compression.lzo.LzoCodec is not part of stock
    -- Hadoop; it requires the hadoop-lzo library on the classpath — confirm.
    set hive.io.rcfile.record.buffer.size = 16777216;  -- 16 * 1024 * 1024
    set io.file.buffer.size = 131072;                  -- I/O buffer: 128 * 1024

    set hive.enforce.bucketing=true;
    set hive.exec.compress.output=true;
    set mapred.output.compress=true;
    set mapred.output.compression.codec=com.hadoop.compression.lzo.LzoCodec;
    set io.compression.codecs=com.hadoop.compression.lzo.LzoCodec;
    FROM student
    INSERT OVERWRITE TABLE studentlzo PARTITION (stat_date="20120802")
    SELECT id, age, name
    WHERE stat_date = "20120802"
    SORT BY age;

    -- Import into a SequenceFile table with gzip compression.
    -- (Fixed: HiveQL comments use --, not //. SELECT * replaced with the explicit
    -- column list of student — id, age, name plus the stat_date partition column —
    -- so a later schema change cannot silently shift columns.)
    -- NOTE(review): studentseq is not created anywhere in this snippet; it must be
    -- created beforehand (STORED AS SEQUENCEFILE) with a matching 4-column schema.
    set hive.exec.compress.output=true;
    set mapred.output.compress=true;
    set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
    set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;
    INSERT OVERWRITE TABLE studentseq
    SELECT id, age, name, stat_date FROM student;

  • 相关阅读:
    二叉树的存储结构
    面试Java需要的知识总结
    EJB总结
    WEB 容器、WEB服务和应用服务器的区别与联系
    Linux安装JBOSS
    JBOSS和WebLogic区别
    深入浅出JMS(一)--JMS基本概念
    Java缓冲流细节
    xor和路径(codevs 2412)
    外星千足虫(bzoj 1923)
  • 原文地址:https://www.cnblogs.com/chengxin1982/p/3981954.html
Copyright © 2011-2022 走看看