zoukankan      html  css  js  c++  java
  • hiveql basic

    -- Show the current database name in the Hive CLI prompt.
    set hive.cli.print.current.db=true;
    -- hive.mapred.mode: strict makes Hive reject certain risky queries, nonstrict
    -- lifts those restrictions (see the Hive configuration documentation).
    -- NOTE(review): the two values appear to be listed as alternatives; if both lines
    -- run in sequence, the later (nonstrict) setting is the one left in effect.
    set hive.mapred.mode=strict;
    set hive.mapred.mode=nonstrict;
    -- List the partitions of a table ("tablename" is a placeholder).
    SHOW PARTITIONS tablename;


    -- Dynamic partition inserts: partition columns are matched by position, not by name.


    -- Dynamic partition insert: the values for the partition columns (country, state)
    -- are taken from the LAST two columns of the SELECT list, matched by position,
    -- not by name. The "..." below is a placeholder for the non-partition columns.
    --
    -- Dynamic partitioning must be enabled, and because neither partition column is
    -- given a static value here, the mode must be nonstrict (strict mode requires at
    -- least one static partition column).
    SET hive.exec.dynamic.partition=true;
    SET hive.exec.dynamic.partition.mode=nonstrict;

    INSERT OVERWRITE TABLE employees
    PARTITION (country, state)
    SELECT ..., se.cnty, se.st
    FROM staged_employees se;

    -- Enable map-side (partial) aggregation for GROUP BY queries; see hive.map.aggr
    -- in the Hive configuration documentation.
    SET hive.map.aggr=true;

    -- Written this way (subquery in the FROM clause), there is no need to create a temporary table.


    -- Hive allows the FROM clause to come first. The inline subquery e computes the
    -- derived columns once, so the outer SELECT and WHERE can refer to them by alias.
    -- upper(name) must carry an explicit alias: without one Hive assigns a generated
    -- column name (_c0), and the outer reference e.name would not resolve.
    FROM (
      SELECT upper(name) AS name,
             salary,
             deductions["Federal Taxes"] AS fed_taxes,
             round(salary * (1 - deductions["Federal Taxes"])) AS salary_minus_fed_taxes
      FROM employees
    ) e
    SELECT e.name, e.salary_minus_fed_taxes
    WHERE e.salary_minus_fed_taxes > 70000;

    --When Hive Can Avoid MapReduce


    -- Let Hive decide automatically to run suitable (small) jobs in local mode
    -- instead of submitting them to the cluster; see hive.exec.mode.local.auto
    -- in the Hive configuration documentation.
    set hive.exec.mode.local.auto=true;

    -- Hive supports the classic SQL JOIN statement, but only equi-joins are supported.
    -- Hive also assumes that the last table in the query is the largest:
    -- it attempts to buffer the other tables and then stream the last table through.
    -- You should therefore structure your join queries so that the largest table is last.


    -- The STREAMTABLE hint names the table (alias s) that Hive should stream through
    -- the join rather than buffer, instead of relying on table order in the query.
    SELECT /*+ STREAMTABLE(s) */
           s.ymd,
           s.symbol,
           s.price_close,
           d.dividend
    FROM stocks s
    JOIN dividends d
      ON s.ymd = d.ymd
     AND s.symbol = d.symbol
    WHERE s.symbol = 'AAPL';

    -- Let Hive automatically convert a join into a map-side join when one side is small.
    set hive.auto.convert.join=true;
    -- Size threshold in bytes: a table smaller than this can be used as the small
    -- side of the join in the map phase.
    set hive.mapjoin.smalltable.filesize=25000000;
    -- The MAPJOIN hint explicitly requests that dividends (alias d) be the table
    -- held in memory for a map-side join.
    SELECT /*+ MAPJOIN(d) */ s.ymd, s.symbol, s.price_close, d.dividend
    FROM stocks s JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol
    WHERE s.symbol = 'AAPL';

    -- Settings for bucketed map joins: use the bucketized input format, enable the
    -- bucket map join optimization, and additionally enable its sort-merge variant.
    -- NOTE(review): presumably both tables must be bucketed (and, for sortedmerge,
    -- sorted) on the join keys — confirm against the Hive documentation.
    set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
    set hive.optimize.bucketmapjoin=true;
    set hive.optimize.bucketmapjoin.sortedmerge=true;

    -- Using DISTRIBUTE BY ... SORT BY, or the shorthand CLUSTER BY clause, is a way to exploit
    -- the parallelism of SORT BY, yet achieve a total ordering across the output files.
    -- This is better than using ORDER BY, which sends everything through a single reducer.

    --Queries that Sample Data


    -- Bucket sampling: rows are assigned to 10 buckets using rand(), and bucket 3 is returned.
    SELECT *
    FROM numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;

    -- Block sampling: sample 0.1 percent of the table.
    SELECT *
    FROM numbersflat TABLESAMPLE(0.1 PERCENT) s;

    --index


    -- Define a compact index named employees_index on employees.country.
    -- WITH DEFERRED REBUILD creates the index empty; per the Hive documentation it is
    -- populated afterwards with ALTER INDEX employees_index ON employees REBUILD.
    -- NOTE(review): index support was removed in Hive 3.0 — confirm the target version.
    CREATE INDEX employees_index
    ON TABLE employees (country)
    AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
    WITH DEFERRED REBUILD;

    Looking for a job working at Home about MSBI
  • 相关阅读:
    A+B for Input-Output Practice (VIII)
    A+B for Input-Output Practice (VI)
    A+B for Input-Output Practice (VII)
    A+B for Input-Output Practice (IV)
    1.1.4 A+B for Input-Output Practice (V)
    1.1.3 A+B for Input-Output Practice (III)
    基础练习 龟兔赛跑预测
    基础练习 回形取数
    Python实用黑科技——以某个字段进行分组
    Python黑科技神奇去除马赛克
  • 原文地址:https://www.cnblogs.com/huaxiaoyao/p/4361736.html
Copyright © 2011-2022 走看看