zoukankan      html  css  js  c++  java
  • hive-分析函数

    bitmap
    --------------
        位图/位映射。
        5
        
    hive
    --------------
        分区表
        udtf函数
        wordcount
        lateral view            //和udtf配合使用。
        order by                //数据倾斜
        sort by                    //reduce内排序
        distribute by            //分区
        cluster by                //sort by + distribute by
    
    hive
    --------------
        --显式表头
        hive>set hive.cli.print.header=true ;
    
    hive分析函数
    --------------
        1.准备数据
            create table emp
            (
                id int ,
                empno string ,
                age int ,
                salary float ,
                deptno int 
            )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS TEXTFILE;
    
        2.加载数据
            1,tom1,28,3000,1
            2,tom2,29,5000,2
            3,tom3,30,5400,1
            4,tom4,28,6600,3
            5,tom5,27,8000,2
            6,tom6,35,10000,3
    
        3.加载数据
            load data local inpath '/home/centos/emp.txt' into table emp ;
    
    hive分区函数应用
    -----------------
        0.简介
            hive分析函数可以连接每条记录形成数据集,字段甚至可以不是分组字段,使用一次mr
            完成聚合查询。
            常规的sql下的分组聚合很诸多限制,其中select字段必须是分组字段,有时需要多次mr.
            select deptno , max(salary) from emp group by deptno ;
        1.分析函数
            -- over , 只是分区
            SELECT id, empno, salary ,deptno ,max(salary) OVER (PARTITION BY deptno) AS max from emp  ;
    
    
            -- OVER + ORDER BY , 分区并在分区内排序
            SELECT empno, deptno, salary ,SUM(salary) OVER(PARTITION BY deptno ORDER BY salary) AS t1 from emp;
    
            -- OVER ... rows unbounded preceding 基于前导所有行的汇总操作。
            SELECT empno, deptno, salary , SUM(salary) OVER(ORDER BY deptno, empno rows unbounded preceding) AS t3 from emp ;
    
            -- RANK, 排名操作 ,计算每个部门内按照工资的降序进行排名(有缝,并列情况存在缝隙)绝对排名。
            SELECT empno, deptno, salary,  RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
            
            -- dense_rank()密度排名,无缝。绝对排名。
            SELECT empno, deptno, salary,  Dense_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
    
            -- percent_rank()百分比排名,相对排名.
            SELECT empno, deptno, salary,  percent_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
    
            --NTILE(n) ,分桶操纵,将数据均匀分散到各个桶中。
            SELECT empno, deptno, salary , NTILE(4) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
    
            -- lead()前导操作,查询从当前开始,后续第几行的操作。
            SELECT empno, deptno, salary, LEAD(salary, 2) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
    
            --lag,从当前行计数,访问之前的第几行salary,如果超过窗口范围返回null。
            SELECT empno, deptno, salary, lag(salary, 1) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
    
            --first_value()
            SELECT empno, deptno, salary, first_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
    
            --last_value()
            SELECT empno, deptno, salary, last_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
            
            --使用range开窗函数 RANGE BETWEEN ... AND ...,在分区内在划分记录范围。
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ;
    
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ;
            
            --RANGE : 对值的+/-.
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ;
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ;
    
            --range : 计算的是值
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN 2000 PRECEDING AND 2000 FOLLOWING) AS t1 from emp ;
            //rows计算的行
            SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS t1 from emp ;
    
            --窗口重用
            SELECT empno, deptno, salary , MAX(salary) OVER w1 AS mx,MIN(salary) OVER w1 AS mn,AVG(salary) OVER w1 AS ag from emp WINDOW w1 AS (PARTITION BY deptno ORDER BY salary desc ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) ;
    
            [开窗函数]
            range|rows between ... and ;
            range是值偏移,rows是行偏移。
    
        2.统计员工数量
            select max(salary) from emp group by deptno union select max(salary) from emp group by age union select max(salary) from emp group by deptno,age ;
    
    
    hive高级聚合
    ---------------
        1.grouping set
            作用等同于union.
            select deptno , age , count(1) from emp group by deptno,age grouping sets(deptno ,age ,(deptno,age) ) ;
        3.rollup
            select ... from ... GROUP BY a,b,c WITH ROLLUP ;
            select ... from ... GROUP BY a,b,c GROUPING SETS ((a,b,c),(a,b),(a),())
    
            select deptno , age , count(1) from emp group by deptno,age with rollup ;
        4.cube
            select ... from ... GROUP BY a,b,c WITH cube ;
            select ... from ... GROUP BY a,b,c GROUPING SETS ((a),(a,b),(a,c),(a,b,c) ,(b),(b,c),(c),()))
    
            select deptno , age , count(1) from emp group by deptno,age with cube ;
    
    hive优化
    -------------------
        1.创建索引
            本质上就是表,对于检索单条记录是有优势的。
            排序的。
            --创建索引
            CREATE INDEX idx_emp ON TABLE emp (empno) AS 'COMPACT' WITH DEFERRED REBUILD;
    
            --生成索引
            ALTER INDEX idx_emp ON emp REBUILD;
    
            --查询是通过索引列查询
            select * from emp where empno = 'tom1'
    
        2.文件格式
            [列存储格式]
            parquet , orcfile , orc    //
            投影查询时,发挥磁盘的线性读写。
            select id,name from emp ;
            //创建表,存储成parquet格式
            create table pq1(id int , empno string, age int ,salary float ,  deptno int ) stored as parquet ;
            
            //查看文件格式
            insert into pq1 select * from emp ;
            [行存储]
            txt 
    
        3.压缩
            减少数据传输量,降低网络IO的负载。
            --在多级job中,job之间的结果是否需要压缩。
            SET hive.exec.compress.intermediate=true ;
            SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec ;
            SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.GzipCodec ;
            SET hive.intermediate.compression.type=record|block|none ;
    
            --控制job的最终输出是否压缩.
            SET hive.exec.compress.output=true;
            SET mapred.output.compression.codec= org.apache.hadoop.io.compress.GzipCodec;
        
        4.大量小文件
            导致map过多。
            1.归档
                hadoop Archive and HAR进行归档。
    
            2.转换成序列文件
                将小文件合成SeqenceFile.
    
            3.org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
                alter table emp set inputformat=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
                --修改默认输入格式
                set hive.input.format=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat ;
    
                --建新表
    create table emp2
    (
    id int ,
    empno string ,
    age int ,
    salary float ,
    deptno int 
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS
    INPUTFORMAT 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
  • 相关阅读:
    剑指offer-二维数组中的查找
    TF-IDF(term frequency–inverse document frequency)
    Java实现中文字符串的排序功能
    当前课程
    【R】资源整理
    CentOS相关
    【转】Setting up SDL Extension Libraries on MinGW
    【转】Setting up SDL Extension Libraries on Visual Studio 2010 Ultimate
    【转】Setting up SDL Extension Libraries on Code::Blocks 12.11
    【转】Setting up SDL Extension Libraries on Visual Studio 2019 Community
  • 原文地址:https://www.cnblogs.com/zyde/p/9225353.html
Copyright © 2011-2022 走看看