zoukankan      html  css  js  c++  java
  • hive高阶函数和采样-优化

     select a.id, a.month from user_b a union all select b.id, b.day from user_b b;
    桶表:和hash partition类似
    =============================================
        bucket    //clustered(id) into 2 buckets
        
        为了避免分区表过多产生海量文件夹
        文件段形式存在
    
    
    分区+分桶表插入问题
        
        insert into xx partition(''='',''='') select 1,'tom',20;
    
        000000        //第一桶
        000001        //第二桶
    
        select size(arr)
        select array(1,2,3)        //创建array
        select map(1,'tom',2,'tomas')    //创建map
        select struct(1,'tom','male')    //创建匿名结构体 ['col1':1,'col2':'tom' ...]
        select named_struct('id',1,'name','tom','sex','male')    //创建带名结构体
    
        select current_date();        //当前日期
        select current_timestamp();    //当前时间(精确到毫秒)
    
        select date_format(current_date(),'yyyyMMdd')    //日期转换
        select date_format(current_timestamp(),'yyyyMMdd')    //日期转换
    
        select from_unixtime(bigint,'yyyyMMdd')        //将时间戳转换成时间
    
    
    hive的文件格式:stored as parquet;
    ===========================================
    插入方法:创建text,然后insert into  xx select * from
    
        行级存储
            text
            seqFile        1M
    
        列级存储    在进行投影查询的时候,会跳过不相关的列
            rcFile        4M块
            orcFile        能够支持更大的块256M
            parquet        支持更多hadoop生态圈组件    
    
        
    
    SerDe:
    =========================================
        serialize && deserialize
    
        将文件字段映射成hive表中的列
        
        使用阶段:在inputFormat之后
    
        textFile:lazySimpleSerde
              openCsvSerde
              JsonSerde        //创建时字段务必与json中的key对应
        
        创建表指定serde:
            row format serde 'org.xxx.xxx.JsonSerde';
    
        在hive字段中以特殊字符开头的字段
            create table xx(`_location` string);
    
    
    分析函数:
    =================================
        sum()over(sort by age rows between unbounded preceding and unbounded following);    //排序并界定窗口
        sum()over(sort by age range between unbounded preceding and unbounded following);
    
        sum()over(partition by province order by age range between unbounded preceding and unbounded following);    //分组排序+界定窗口
        
        current row    //当前行
    
    Hive的分组方式
    
        row_number() 是没有重复值的排序(即使两天记录相等也是不重复的),可以利用它来实现分页
        dense_rank() 是连续排序,两个第二名仍然跟着第三名
        rank()       是跳跃排序的,两个第二名下来就是第四名
        -- percent_rank()百分比排名,相对排名.
            SELECT empno, deptno, salary,  percent_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
        
        rank()        //并列跳跃    113
        dense_rank()    //不跳跃    112
        row_number()    //顺序        123
    
        
        ntile(n)    //三六九等    
    
        first_value()    //取第一个值
    
        lead()        //窗口上浮
    
        //连续两个月活跃
        select id , m , n from 
        (select id, (month+1) as m , lead(month)over(partition by id order by month) as n from user_c) a 
        where  a.m=a.n;
        //select id ,m ,n from(select id m(month+1)as m , lead(month) over(partition by id order by month) as n from user_c )a where a.m=a.n;
        //连续三个月活跃
        select distinct id from (select id, (month+2) as first , (lead(month)over(partition by id order by month)+1) 
        as second, lead(month,2)over(partition by id order by month) as third from user_c) a 
        where a.first=a.second and a.second=a.third;
        
        //select distinct id from (select id ,(month+2) as first,(lead(month)over(partition by id order by month)+1) 
        //as second,lead(month,2)over (partition by id order by month)as third from user_c) a where a.first = a.second and a.second=a.third;
    
        lag
    
    
        pv:page view    //页面浏览量,统计总浏览数
        uv:user view    //用户浏览数,统计用户数
    
        根据pv,统计uv:
        select month,day, count(distinct id) as uv from user_b group by month,day;
    
    
    高级聚合函数:**********************
    
        grouping__id    //组号
        
        grouping sets    组集    //分别统计月活和日活
        select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
        grouping sets(month,day) order by grouping__id ;
    
        //select month,day count(distinct id) as uv grouping_id from user_b group by month,day grouping sets(month,day)order by grouping_id;
        
        +----------+-------------+-----+---------------+--+
        |  month   |     day     | uv  | grouping__id  |
        +----------+-------------+-----+---------------+--+
        | 2015-04  | NULL        | 6   | 1             |
        | 2015-03  | NULL        | 5   | 1             |
        | 2015-02  | NULL        | 2   | 1             |
        | NULL     | 2015-04-16  | 2   | 2             |
        | NULL     | 2015-04-15  | 2   | 2             |
        | NULL     | 2015-04-13  | 3   | 2             |
        | NULL     | 2015-04-12  | 2   | 2             |
        | NULL     | 2015-03-12  | 1   | 2             |
        | NULL     | 2015-03-10  | 4   | 2             |
        | NULL     | 2015-02-16  | 2   | 2             |
        +----------+-------------+-----+---------------+--+
    
    
    
        rollup        汇总
        select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
        with rollup order by grouping__id ;
        //select month,day count (distinct id )as uv grouping_id from user_b group by month,day with rollup order by grouping_id;
        
    
        +----------+-------------+-----+---------------+--+
        |  month   |     day     | uv  | grouping__id  |
        +----------+-------------+-----+---------------+--+
        | NULL     | NULL        | 7   | 0             |
        | 2015-04  | NULL        | 6   | 1             |
        | 2015-03  | NULL        | 5   | 1             |
        | 2015-02  | NULL        | 2   | 1             |
        | 2015-04  | 2015-04-16  | 2   | 3             |
        | 2015-04  | 2015-04-15  | 2   | 3             |
        | 2015-04  | 2015-04-13  | 3   | 3             |
        | 2015-04  | 2015-04-12  | 2   | 3             |
        | 2015-03  | 2015-03-12  | 1   | 3             |
        | 2015-03  | 2015-03-10  | 4   | 3             |
        | 2015-02  | 2015-02-16  | 2   | 3             |
        +----------+-------------+-----+---------------+--+
    
        select day,month count(distinct id) as uv, grouping__id from user_b group by day,month
        with rollup order by grouping__id ;    
        //selectt day ,month count(distinct id ) as uv,grouping_id form user_b group by day,month with rollup order by grouping_id;
        
        +-------------+----------+-----+---------------+--+
        |     day     |  month   | uv  | grouping__id  |
        +-------------+----------+-----+---------------+--+
        | NULL        | NULL     | 7   | 0             |
        | 2015-04-16  | NULL     | 2   | 2             |
        | 2015-04-15  | NULL     | 2   | 2             |
        | 2015-04-13  | NULL     | 3   | 2             |
        | 2015-04-12  | NULL     | 2   | 2             |
        | 2015-03-12  | NULL     | 1   | 2             |
        | 2015-03-10  | NULL     | 4   | 2             |
        | 2015-02-16  | NULL     | 2   | 2             |
        | 2015-04-16  | 2015-04  | 2   | 3             |
        | 2015-04-13  | 2015-04  | 3   | 3             |
        | 2015-04-12  | 2015-04  | 2   | 3             |
        | 2015-03-12  | 2015-03  | 1   | 3             |
        | 2015-03-10  | 2015-03  | 4   | 3             |
        | 2015-02-16  | 2015-02  | 2   | 3             |
        | 2015-04-15  | 2015-04  | 2   | 3             |
        +-------------+----------+-----+---------------+--+
        
    
        cube        魔方
        select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
        with cube order by grouping__id ;
        
        //select month,day ,count(distinct id) as uv ,grouping_id from user_b group by month,day with cube order by grouping_id; 
    
        2    1, 2, [1,2]
        3    1,2,3, [1,2],[1,3],[2,3],[1,2,3]
        4
        
        +----------+-------------+-----+---------------+--+
        |  month   |     day     | uv  | grouping__id  |
        +----------+-------------+-----+---------------+--+
        | NULL     | NULL        | 7   | 0             |
        | 2015-02  | NULL        | 2   | 1             |
        | 2015-04  | NULL        | 6   | 1             |
        | 2015-03  | NULL        | 5   | 1             |
        | NULL     | 2015-02-16  | 2   | 2             |
        | NULL     | 2015-04-16  | 2   | 2             |
        | NULL     | 2015-04-15  | 2   | 2             |
        | NULL     | 2015-04-13  | 3   | 2             |
        | NULL     | 2015-04-12  | 2   | 2             |
        | NULL     | 2015-03-12  | 1   | 2             |
        | NULL     | 2015-03-10  | 4   | 2             |
        | 2015-04  | 2015-04-12  | 2   | 3             |
        | 2015-03  | 2015-03-12  | 1   | 3             |
        | 2015-03  | 2015-03-10  | 4   | 3             |
        | 2015-04  | 2015-04-16  | 2   | 3             |
        | 2015-02  | 2015-02-16  | 2   | 3             |
        | 2015-04  | 2015-04-15  | 2   | 3             |
        | 2015-04  | 2015-04-13  | 3   | 3             |
        +----------+-------------+-----+---------------+--+
    
    
    
    
    cookie1    1
    cookie2    5
    cookie2    4
    cookie1    3
    cookie2    7
    cookie1    4
    cookie2    2
    cookie3    2
    cookie2    3
    cookie3    5
    cookie1    6
    cookie3    10
    cookie2    8
    
    
    hive的事务性:
    ===================================
        1、配置文件
        SET hive.support.concurrency=true; 
        SET hive.enforce.bucketing=true; 
        SET hive.exec.dynamic.partition.mode=nonstrict;
        SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; 
        SET hive.compactor.initiator.on=true; 
        SET hive.compactor.worker.threads=1;
    
    
        2、桶表
        
        3、指定事务开启
    
        4、orc文件
        create table user3(id int, name string, age int, province string, city string) 
        clustered by(province) into 2 buckets
        row format delimited
        fields terminated by '	'
        stored as orc 
        tblproperties('transactional'='true');
    
    
        5、insert into user3 select * from user1;
    
        
        6、update user3 set age=200 where id=5;
    
    
    采样:取出大型数据集的子集
    ============================================
        随机采样:
            select * from user2 distribute by rand() sort by rand() limit 10;
            
            //select * from user2 distribite by rand() sort rand() limit 10;
            
            随机模式分发+ 随机模式排序
    
    
        桶表采样:
            对于桶表的采样优化
            select name from user1 TABLESAMPLE(bucket 1<指定桶数> out of 4<总桶数> on rand()) ;
            //select name from user1 tablesample(bucket 1 out of 4 on rand());
    
            随机取得某个桶的数据,
    
            create table user1_bucket(id int, name string, age int, province string, city string) 
            clustered by(province) into 4 buckets
            row format delimited fields terminated by '	';
    
            insert into user1_bucket select * from user1;
    
    
    1    tom    30    anhui    anhui
    2    tomas    30    hefei
    1    tom    30    heilongjiang
    2    tomas    30    jilin 
    1    tom    30    liaoning
    2    tomas    30    neimenggu
    1    tom    30    shandong
    2    tomas    30    shanxi
    1    tom    30    qinghai
    2    tomas    30    jiangsu
    1    tom    30    gansu
    2    tomas    30    ningxia
    1    tom    30    hubei
    2    tomas    30    sichuan
    1    tom    30    chongqing
    2    tomas    30    taiwan
    1    tom    30    xianggang
    2    tomas    30    guangdong
    
    
    
        块采样:可以指定文件大小、行数、百分比进行数据的采样
            select * from users TABLESAMPLE(5 percent);
            select * from users TABLESAMPLE(30 M);
            select * from users TABLESAMPLE(2 rows);
    
    
    用户定义函数
    =====================
        1、UDF        //单行转单行    add(id,name) ===> idname
                //处理单位一行,返回一行
    
           入口点:UDF类
    
        2、UDTF        //单行转多行    explode(arr) ===> arr[1]
                                  arr[2]
                                  arr[3]
    
        
        3、UDAF        //多行转单行    sum(age)     ===> 500
    
    
    编写hive自定义函数:
    ==========================================
        1、pom文件
            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-exec</artifactId>
                <version>2.1.1</version>
            </dependency>
    
        2、编写代码
    
            注意:对于hive的UDF,不支持变长参数,但是支持list
            @Description(
                name = "add",
                value = "this is an add function",
                extended = "select add(1,2,3) ====> 6 ; select add('hello','world') ====> helloworld"
            )
            public class UDFAdd extends UDF{
                /**
                 * 将所有i进行相加
                 * @param i
                 * @return
                 */
                public Integer evaluate(int i, int j){
                return i+ j;
                }
                public String evaluate(String i, String j){
                return i+ j;
                }
                public int evaluate(List<Integer> i){
                int j = 0;
                for (Integer integer : i) {
                    j += integer;
                }
                return j;
                }
    
    
    
            }
    
    
    
        3、加载并使用jar
            第一个方法:打包并将其复制到hive的lib文件夹            //hive的默认类路径
            第二个方法:add jar /soft/hive/lib/myhive-1.0-SNAPSHOT.jar;    //手动指定hive类路径
            第三个方法:修改配置文件:hive-site.xml                //hive.aux.jars.path=/x/x/x.jar
    
    
        4、将类加载成hive的函数
            创建函数[临时函数]
                beeline> create temporary function add as  'com.oldboy.hive.UDFAdd';
    
            [永久函数]
                beeline> create function add as 'com.oldboy.hive.UDFAdd'
                     using jar 'hdfs:///path/to/x.jar';
    
                该方式非常重要,完全分布式时需要使用该方式,否则找不到函数类。
    
        
        5、删除方法:
            drop temporary function add ;
    
    
    
    1、将商家的标签,通过udf进行解析并返回标签
    
        json ==== udf + fastJson ===> {'味道好','服务好'}
    
        如何在hive中使用fastJson?    //将其拷贝到hive的lib文件夹并重启hive
    
    
        1)编写程序并打包程序
        2)将jar和fastJson文件传送到hive的lib文件夹
        3)添加临时函数create temporary function parseJson as  'com.oldboy.hive.TempTagUDF';
        4)创建temptag表,字段包括id和json,以'	'作为分隔
        5)将TempTag.txt加载到hive的temptag表中
        6)使用udf对商家评论进行操作
        7)统计各个商家去重之后的标签数
            select  id, count(distinct tag)  from  temptags lateral view explode(parseJson(json)) xx  as tag group by id;
        8)统计各个商家各个标签的数量
            select id,  tag, count(tag) from temptags lateral view explode(parseJson(json)) xx  as tag group by id,tag ;
    
    
        !!!!出现问题:类找不到异常
            Caused by: java.lang.ClassNotFoundException: com.oldboy.hive.TempTagUDF
    
            原因分析:hive在使用MR操作时,在其他节点接受不到TempTagUDF类所在jar
    
            解决:将udf的jar和fastjson一同复制到${HADOOP_HOME}/share/hadoop/common/lib下,并同步到其他节点
                  不需要重启hadoop
    
    
    maven项目设置编译器为jdk1.8(默认1.5)    //放在<project>标签下
        <build>
            <plugins>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <configuration>
                        <source>1.8</source>
                        <target>1.8</target>
                    </configuration>
                </plugin>
            </plugins>
        </build>
    
    
    
    虚列:
    ======================    
        > select name, explode(work_place) from default.employee;    //udtf不支持外部的select从句
        UDTF's are not supported outside the SELECT clause, nor nested in expressions (state=42000,code=10081)
    
    
        lateral view    //对表生成函数的补充
    
        select id, tag from temptags lateral view explode(parseJson(json)) xx as tag ;
              //虚列
              //lateral view explode(parseJson(json)) xx as tag    (xx无意义,占位符)
    
    
        select id , explode()
    
    
    
    UDTF:
    ===================================
        StructObjectInspector    类
        ObjectInspector[]
    
        create temporary function wc as  'com.oldboy.hive.TestUDTF';
    
        LazySimpleSerDe中的lazy格式,默认string格式,只有在使用或者声明格式的时候才进行转换
    
    
    
    hive优化:
    =====================================
        性能工具:
            EXPLAIN        //解释hive运行过程中MR作业整体流程
                    //explain select count(1) from wc;
    
            Analyze        //在下一次执行的时候使用CBO(cost-based-optimize)基于成本的优化来执行作业
                    //analyze table wc compute STATISTICS
                    //desc formatted wc    =====> 能看到文件行数统计以及文件大小
    
    
        设计优化:
            分区表:    //以日期+时间、以location、以业务逻辑为分区字段,优化分区(对where子句查询的优化)
                    //create table xx() partition by (province string, city string);
    
            分桶表:    //可以进行采样、对join的优化比较好(分桶字段如果和join字段一致,
                    //在join操作时会选择分区内部的桶文件段,避免了全文件扫描)
                    //create table xx() clustered by(province) into 2 buckets;
    
            创建索引:    // CREATE INDEX idx_id_employee_id ON TABLE employee_id (id)
                       AS 'COMPACT' WITH DEFERRED REBUILD;        //创建compact索引
    
                    // CREATE INDEX idx_id_employee_id ON TABLE employee_id (id)     
                       AS 'BITMAP' WITH DEFERRED REBUILD;        //创建位图索引
    
    
            执行引擎优化:    //hive2中已经不推荐使用mr作为执行引擎
                    //推荐使用spark、tez作为执行引擎
    
        数据文件格式优化:、
    
            数据格式:
                text
                seqFile
                RCFile
                ORCFile
                Parquet
    
            压缩(defaultgziplzozip2lz4snappy)
                中间数据压缩:    //分担网络间分发压力和磁盘存储压力
                        //set hive.exec.compress.intermidiate=true
                        //set hive.intermidiate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec
                
                输出文件压缩    //减小磁盘存储压力
                        //set hive.exec.compress.output=true
                        //set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec
    
            存储优化:
                1、使用har文件
                2、使用SeqFile格式
                3、使用CombineFileInputFormat格式    //hive自动优化手段,将多个小文件合并为单个文件
                4、使用hdfs联邦                //在其他节点存储namenode数据,水平扩展其容量(不实用)
    
                5、配置文件:
                    set hive.merge.mapfiles=true    //仅map任务时合并输出的小文件
                    set hive.merge.mapredfiles=true    //MR作业时合并输出小文件
                    set hive.merge.size.per.task=256000000    //定义job合并文件的大小
                    set hive.merge.smallfiles.avgsize=16000000    //定义合并小文件的触发阈值
    
        作业和查询优化:
            使用本地模式:
                beeline本地模式(不用开启hive2):beeline -u jdbc:hive2:// -n centos
                hive本地模式触发条件:
                    SET hive.exec.mode.local.auto=true;            //本地模式自动进入
                    SET hive.exec.mode.local.auto.inputbytes.max=50000000;    //自动进入本地模式的输入阈值,超过则退出本地模式
                    SET hive.exec.mode.local.auto.input.files.max=5;    //自动进入本地模式的文件个数阈值,超过则退出本地模式
    
            jvm重用:仅适用于MR一代,yarn不适用
                MR作业会共享jvm虚拟机内存而非全部并行计算
                    set mapred.job.reuse.jvm.num.tasks=5
    
    
    
            并行计算:多用于分布式作业,之在不同主机中同时进行同一作业的处理
            并发执行:多用于线程
    
                   在hive作业执行时,多个stage之间不一定相互依赖,在此时可以设置并行执行
                   set hive.exec.parellel=true;
                   set hive.exec.parellel.thread.number=16;
    
    
            join优化:**********
                Common join    //reduce 端的join
                        //通过暗示指定大表/*+ STREAMTABLE(bigtable) */
                    
                Map join    //map 端
                        //通过暗示指定小表/*+MAP JOIN(smalltable) */
                        SET hive.auto.convert.join=true; 
                        --default false
                        SET hive.mapjoin.smalltable.filesize=600000000;
                        --default 25M 超过此值,使用reduce端join
                        SET hive.auto.convert.join.noconditionaltask=true;
                        --default false. true说明不需要暗示
                        SET hive.auto.convert.join.noconditionaltask.size=10000000;
                        --控制表大小和内存的适配
    
                桶表join优化:    //SET hive.auto.convert.join=true; --default false
                        //SET hive.optimize.bucketmapjoin=true; --default false
    
                    
                !!!!!!!!join端数据倾斜处理
                        //SET hive.optimize.skewjoin=true;     进行负载均衡
                          SET hive.skewjoin.key=100000;    //在reduce中如果一个reduce接受的数据超过此值,会自动发送给空闲的reduce
    
                        
                        
    
            group by优化
                !!!!!!!!group by数据倾斜处理
                SET hive.groupby.skewindata=true;
                
                
                
                
                 
    1、请把下一语句用hive方式实现?
    
    SELECT a.key,a.value 
    FROM a 
    WHERE a.key not in (SELECT b.key FROM b)
    答案:
    select a.key,a.value from a where a.key not exists (select b.key from b)
    
    
    
    
    2、Multi-group by 是hive的一个非常好的特性,请举例说明?  多组
    
    
    from A
    insert overwrite table B
     select A.a, count(distinct A.b) group by A.a
    insert overwrite table C
      select A.c, count(distinct A.b) group by A.c
      
    //from  A insert overwrite table B select A.a count(A.b) group by A.a  insert overwrite table C select A.c count(A.b)group by A.c
    
    
    
    3、写出将 text.txt 文件放入 hive 中 test 表‘2016-10-10’ 分区的语句,test 的分区字段是 l_date。
    
     
    LOAD DATA LOCAL INPATH '/your/path/test.txt' OVERWRITE INTO TABLE test PARTITION (l_date='2016-10-10')
    
    https://blog.csdn.net/haohaixingyun/article/details/52819588网页连接
    
    https://blog.csdn.net/ukakasu/article/details/47860647面试题是实例,处理大表数据。里面有原题连接。
  • 相关阅读:
    btrfs基础
    解决debian编辑器vim没有颜色的问题
    [转载]RHEL-6启动时提示:“/usr/libexec/gconf-sanity-check-2 exited with status 256”
    [转载]vim E667: Fsync failed
    Windows Server 2003修改远程桌面连接数
    [转载]"由于这台计算机没有远程桌面客户端访问许可证,远程会话被中断"的解决方案
    [转载]远程登录提示"*已经被拒绝登录到终端*"
    ESXi开启SSH的方法总结
    glibc 编译安装( version `GLIBC_2.14' not found")
    [转载]Wireshark基本介绍和学习TCP三次握手
  • 原文地址:https://www.cnblogs.com/zyde/p/9225260.html
Copyright © 2011-2022 走看看