zoukankan      html  css  js  c++  java
  • Hive| ETL清洗& 查询练习

    ETL清洗数据 

    导Jar包

    <dependencies>
            <dependency>
                <groupId>log4j</groupId>
                <artifactId>log4j</artifactId>
                <version>RELEASE</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>2.7.2</version>
            </dependency>
    
        </dependencies>

    ETLUtil.java

    public class ETLUtil {
        public static String etl(String original){
            StringBuilder stringBuilder = new StringBuilder();
            String[] fields = original.split("	");
            if (fields.length < 9){
                return null;
            }
            //日志合规
            //替换空格
            fields[3] = fields[3].replace(" ", "");
            for (int i = 0; i < fields.length - 1; i++){
                if (i == fields.length - 1){
                    stringBuilder.append(fields[i]);
    
                }else if (i < 9){
                    stringBuilder.append(fields[i]).append("	");
                }else {
                    stringBuilder.append(fields[i]).append("&");
                }
            }
            return stringBuilder.toString();
        }
    }

    ETLMapper.java

    public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        Text k = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String original = value.toString();
            String etlString = ETLUtil.etl(original);
            if (StringUtils.isNotEmpty(etlString)){
                k.set(etlString);
                context.write(k, NullWritable.get());
                context.getCounter("ETL", "True").increment(1);
    
            }else {
                context.getCounter("ETL", "False").increment(1);
            }
        }
    }

    ETLDriver.java

    public class ETLDriver {
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Job job = Job.getInstance(new Configuration());
    
            job.setJarByClass(ETLDriver.class);
            job.setMapperClass(ETLMapper.class);
            job.setNumReduceTasks(0);
    
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
    
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            boolean b = job.waitForCompletion(true);
            System.exit(b ? 0 : 1);
        }
    }
    [kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir -p /guli/user
    [kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/video
    [kris@hadoop102 hadoop-2.7.2]$ hadoop fs -mkdir /guli/etl
    [kris@hadoop102 datas]$ hadoop fs -moveFromLocal user.txt /guli/user
    [kris@hadoop102 datas]$ hadoop fs -moveFromLocal *.txt /guli/video
    [kris@hadoop102 hadoop-2.7.2]$ hadoop jar ETLVideo.jar com.atguigu.etl.ETLDriver /guli/video /guli/video_etl
     ETL
                    False=5792
                    True=743569
    创建表:
    create external table gulivideo_ori(
        videoId string, 
        uploader string, 
        age int, 
        category array<string>, 
        length int, 
        views int, 
        rate float, 
        ratings int, 
        comments int,
        relatedId array<string>)
    row format delimited 
    fields terminated by "	"
    collection items terminated by "&"
    stored as textfile
    location '/guli/video_etl';
    
    create external table gulivideo_user_ori(
        uploader string,
        videos int,
        friends int)
    row format delimited 
    fields terminated by "	" 
    stored as textfile
    location '/guli/user';
    
    
    create table gulivideo_orc(
        videoId string, 
        uploader string, 
        age int, 
        category array<string>, 
        length int, 
        views int, 
        rate float, 
        ratings int, 
        comments int,
        relatedId array<string>)
    row format delimited fields terminated by "	" 
    collection items terminated by "&" 
    stored as orc;
    
    create table gulivideo_user_orc(
        uploader string,
        videos int,
        friends int)
    row format delimited 
    fields terminated by "	" 
    stored as orc;
    
    0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_orc select * from gulivideo_ori;
    0: jdbc:hive2://hadoop101:10000> insert into table gulivideo_user_orc select * from gulivideo_user_ori;
                    
    1.--统计视频观看数Top10
    select videoid, uploader, views from gulivideo_orc
    order by views desc limit 10;
    +--------------+------------------+-----------+--+
    | videoid | uploader | views |
    +--------------+------------------+-----------+--+
    | dMH0bHeiRNg | judsonlaipply | 42513417 |
    | 0XxI-hvPRRA | smosh | 20282464 |
    | 1dmVU08zVpA | NBC | 16087899 |
    | RB-wUgnyGv0 | ChrisInScotland | 15712924 |
    | QjA5faZF1A8 | guitar90 | 15256922 |
    | -_CSo1gOd48 | tasha | 13199833 |
    | 49IDp76kjPw | TexMachina | 11970018 |
    | tYnn51C3X_w | CowSayingMoo | 11823701 |
    | pv5zWaTEVkI | OkGo | 11672017 |
    | D2kJZOfq7zk | mrWoot | 11184051 |
    +--------------+------------------+-----------+--+
    10 rows selected (22.612 seconds)
    使用group by的两个要素:
    (1) 出现在select后面的字段 要么是是聚合函数中的,要么就是group by 中的.
    (2) 要筛选结果 可以先使用where 再用group by 或者先用group by 再用having
    


    --2.统计视频类别热度Top10 (类别的videoid--视频的唯一id越多就代表热度高, 类别排序的多少排序;不能分组分组是在组内排序) ①统计视频类别: select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories ②按类别的热度排名 select t1.videoid, t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1 group by t1.categories order by num desc limit 10; --->拼一块:t1.videoid不能出现在select后边, select t1.categories, count(videoid) num from (select videoid, categories from gulivideo_orc lateral view explode(category) tbl as categories) t1 group by t1.categories order by num desc limit 10; +----------------+---------+--+ | t1.categories | num | +----------------+---------+--+ | Music | 179049 | | Entertainment | 127674 | | Comedy | 87818 | | Animation | 73293 | | Film | 73293 | | Sports | 67329 | | Gadgets | 59817 | | Games | 59817 | | Blogs | 48890 | | People | 48890 | +----------------+---------+--+ 10 rows selected (70.01 seconds)
    3.--统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数  //所有类别中包含Top20视频的个数
    
    //Expression not in GROUP BY key 'videoid'
    not in GROUP BY key 'views',后边有views,select后必须加views
    ############
    ①观看数最高的20个视频:
    select videoid, category, views from gulivideo_orc order by views desc limit 20
    ②把类别category炸开--所属类别
    select videoid, categories, views from t1 lateral view explode(category) tbl categories 
    --->前两句合起:
    select t1.videoid, categories, t1.views from (select videoid, category, views from gulivideo_orc order by views desc limit 20
    ) t1 lateral view explode(category) tbl as categories;
    +--------------+----------------+-----------+--+
    |  t1.videoid  |   categories   | t1.views  |
    +--------------+----------------+-----------+--+
    | dMH0bHeiRNg  | Comedy         | 42513417  |
    | 0XxI-hvPRRA  | Comedy         | 20282464  |
    | 1dmVU08zVpA  | Entertainment  | 16087899  |
    | RB-wUgnyGv0  | Entertainment  | 15712924  |
    | QjA5faZF1A8  | Music          | 15256922  |
    | -_CSo1gOd48  | People         | 13199833  |
    | -_CSo1gOd48  | Blogs          | 13199833  |
    | 49IDp76kjPw  | Comedy         | 11970018  |
    | tYnn51C3X_w  | Music          | 11823701  |
    | pv5zWaTEVkI  | Music          | 11672017  |
    | D2kJZOfq7zk  | People         | 11184051  |
    | D2kJZOfq7zk  | Blogs          | 11184051  |
    | vr3x_RRJdd4  | Entertainment  | 10786529  |
    | lsO6D1rwrKc  | Entertainment  | 10334975  |
    | 5P6UU6m3cqk  | Comedy         | 10107491  |
    | 8bbTtPL1jRs  | Music          | 9579911   |
    | _BuRwH59oAo  | Comedy         | 9566609   |
    | aRNzWyD7C9o  | UNA            | 8825788   |
    | UMf40daefsI  | Music          | 7533070   |
    | ixsZy2425eY  | Entertainment  | 7456875   |
    | MNxwAU_xAMk  | Comedy         | 7066676   |
    | RUCZJVJ_M8o  | Entertainment  | 6952767   |
    +--------------+----------------+-----------+--+
    ③类别中包含top20的视频的个数:在上条基础上加上按类别分组,计数组内videoid计数
    --->
    select categories, count(videoid) from (select videoid, category, views from gulivideo_orc order by views desc limit 20
    ) t1 lateral view explode(category) tbl as categories group by categories 
    +----------------+------+--+
    |   categories   | _c1  |
    +----------------+------+--+
    | Blogs          | 2    |
    | Comedy         | 6    |
    | Entertainment  | 6    |
    | Music          | 5    |
    | People         | 2    |
    | UNA            | 1    |
    +----------------+------
    
    -- over里边不能使用limit, 怎么获取分区排序前几个呢?需要使用一个子查询;分区是数据存储上的分子文件,查询时还是在一张表
    select t1.videoid, t1.views, t1.ran, t1.categories from(
    select videoid, views, categories, rank() over(partition by categories order by views desc) ran
    from gulivideo_orc lateral view explode(category) tbl as categories) t1
    where t1.ran <= 5;
    +--------------+-----------+---------+----------------+--+
    |  t1.videoid  | t1.views  | t1.ran  | t1.categories  |
    +--------------+-----------+---------+----------------+--+
    | 2GWPOPSXGYI  | 3660009   | 1       | Animals        |
    | xmsV9R8FsDA  | 3164582   | 2       | Animals        |
    | 12PsUW-8ge4  | 3133523   | 3       | Animals        |
    | OeNggIGSKH8  | 2457750   | 4       | Animals        |
    | WofFb_eOxxA  | 2075728   | 5       | Animals        |
    | sdUUx5FdySs  | 5840839   | 1       | Animation      |
    | 6B26asyGKDo  | 5147533   | 2       | Animation      |
    | H20dhY01Xjk  | 3772116   | 3       | Animation      |
    | 55YYaJIrmzo  | 3356163   | 4       | Animation      |
    | JzqumbhfxRo  | 3230774   | 5       | Animation      |
    | RjrEQaG5jPM  | 2803140   | 1       | Autos    
    ......
    4.--统计视频观看数Top50所关联视频的所属类别排序
    Top50---relatedid---种类---; 炸开之后直接join,因它是张虚拟表,hive是不支持的
    select videoid, views, relatedid from gulivideo_orc order by views desc limit 50
    炸开单独写一个sql: t1 select distinct(tbl.relatedids) rid from t1 lateral view explode(relatedid) tbl as relatedids
    自己join自己下: t2 select g.videoid, g.category from t2 left join gulivideo_orc g on t2.vid=g.videoid
    把category炸开并排序:select cateegories, count(videoid) hot from t3 lateral view explode(category) tb12 as catogories group by categores order by hot desc;

    select categories, count(videoid) hot from(select g.videoid, g.category from(select distinct(tbl.relatedids) rid from(select videoid, views, relatedid from gulivideo_orc order by views desc limit 50) t1 lateral view explode(relatedid) tbl as relatedids) t2 join gulivideo_orc g on t2.rid=g.videoid) t3 lateral view explode(category) tbl2 as categories group by categories order by hot desc; +----------------+------+--+ | categories | hot | +----------------+------+--+ | Comedy | 217 | | Entertainment | 207 | | Music | 186 | | Blogs | 49 | | People | 49 | | Film | 46 | | Animation | 46 | | News | 21 | | Politics | 21 | | Games | 19 | | Gadgets | 19 | | Sports | 17 | | Places | 12 | | UNA | 12 | | Travel | 12 | | Howto | 12 | | DIY | 12 | | Animals | 11 | | Pets | 11 | | Autos | 3 | | Vehicles | 3 | +----------------+------+--+ 21 rows selected (115.239 seconds)
    5.--统计每个类别中的视频热度Top10,以Music为例
    创建类别表:
    create table gulivideo_category(
    videoid string, uploader string, age int, categoryid string, length int, views int, rate float,
    ratings int, comments int, relatedid array<string>)
    row format delimited fields terminated by "	"
    collection items terminated by "&"
    stored as orc;
    插入数据:
    insert into table gulivideo_category
    select videoid, uploader, age, categoryid, length, views, rate, ratings, comments, relatedid
    from gulivideo_orc lateral view explode(category) category as categoryid;
    --->把一张表全查出来:
    select categoryid, videoid, paiming from (
    select categoryid, videoid, rank() over(partition by categoryid order by views desc) paiming from gulivideo_category) t1
    where t1.paiming <= 10;
    select categoryid, videoid, views from gulivideo_category where categoryid="music" order by views desc limit 10; 6.--统计每个类别中视频流量Top10,以Music为例 select videoid, ratings from gulivideo_category where categoryid="music" order by ratings desc limit 10; 7.--统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频 ①上传视频最多的用户Top10: select videos,uploader from gulivideo_user_orc order by videos desc limit 10; ②找出这10个人上传的视频
    select g.videoid, rank() over(partition by g.uploader order by g.views desc) hot from t1 join gulivideo_orc g on t1.uploader = g.uploader
    ③找出前20
    select t2.uploader, t2.videoid from t2 where t2.hot <= 20;
    select t2.uploader, t2.videoid from( select g.uploader, g.videoid, g.views, rank() over(partition by g.uploader order by g.views desc) hot from (select uploader,videos from gulivideo_user_orc order by videos desc limit 10) t1 left join gulivideo_orc g on t1.uploader=g.uploader) t2 where t2.hot <= 20; +----------------+--------------+--+ | t2.uploader | t2.videoid | +----------------+--------------+--+ | NULL | NULL | | NULL | NULL | | NULL | NULL | | NULL | NULL | | Ruchaneewan | xbYyjUdhtJw | | Ruchaneewan | 4dkKeIUkN7E | | Ruchaneewan | qCfuQA6N4K0 | | Ruchaneewan | TmYbGQaRcNM | | Ruchaneewan | dOlfPsFSjw0 | | expertvillage | -IxHBW0YpZw | | expertvillage | BU-fT5XI_8I | | expertvillage | ADOcaBYbMl0 | ... 8.--统计每个类别视频观看数Top10 select t.categoryid, t.videoid, t.ranking from( select categoryid, videoid, rank() over(partition by categoryid order by views desc) ranking from gulivideo_category) t where t.ranking <= 10; +----------------+--------------+------------+--+ | t.categoryid | t.videoid | t.ranking | +----------------+--------------+------------+--+ | Animals | 2GWPOPSXGYI | 1 | | Animals | xmsV9R8FsDA | 2 | | Animals | 12PsUW-8ge4 | 3 | | Animals | OeNggIGSKH8 | 4 | | Animals | WofFb_eOxxA | 5 | | Animals | AgEmZ39EtFk | 6 | | Animals | a-gW3RbJd8U | 7 | | Animals | 8CL2hetqpfg | 8 | | Animals | QmroaYVD_so | 9 | | Animals | Sg9x5mUjbH8 | 10 | | Animation | sdUUx5FdySs | 1 | | Animation | 6B26asyGKDo | 2 | | Animation | H20dhY01Xjk | 3 | | Animation | 55YYaJIrmzo | 4 | | Animation | JzqumbhfxRo | 5 | | Animation | eAhfZUZiwSE | 6 | | Animation | h7svw0m-wO0 | 7 | | Animation | tAq3hWBlalU | 8 | | Animation | AJzU3NjDikY | 9 | | Animation | ElrldD02if0 | 10 | | Autos | RjrEQaG5jPM | 1 | ...... 210 rows selected (24.379 seconds)

    1.分组TOPN选出今年每个学校,每个年级,每个科目分数前三.

    : 时间,学校,年级,姓名,科目,成绩

    建表

    create external table score_test(school string, grade string, name string, subject string, score int)
    partitioned by (year string)
    row format delimited fields terminated by ','
    stored as textfile
    location '/hive_data';
    stored as textfile  ##把它放后边报错
    View Code
    select t1.name, t1.subject, t1.ran from(select name, subject, row_number() over(partition by school, grade, subject order by score desc) ran
    from score_test where year="2013") t1 where t1.ran <= 3;

    2. 今年 清华 1年级 总成绩大于200分的学生 以及学生数 ||多个字段的group by,还要按name分

    select school, grade, name, sum(score) sum_score, count(1) over() num from score_test
    where year = "2013" and school="清华" and grade="1"
    group by school, grade, name having sum_score > 200;
    3. 
    CREATE TABLE transaction_details (cust_id INT, amount FLOAT, month STRING, country STRING) 
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘,’ ; 
    建按月份的分区表:
    create table transaction_details(cust_id int, amount float, month string, country string)
    partitioned by(month string)
    row format delimited fields terminated by ',';
    
    每个月的总收入:
    select cust_id, sum(amount) over(partition by month) as total from transaction_details;
    
    4. 将内部表a,转换成外部表:
    alter tale a set tblproperties ('external'='true');
    
    5.订单详情表ord_det(order_id订单号,sku_id商品编号,sale_qtty销售数量,dt日期分区)
    任务计算2016年1月1日商品销量的Top100,并按销量降级排序
    select order_id, sale_qtty from ord_det 
    where dt = "20160101" order by sale_qtty desc limit 100; 
    STG.ORDER,有如下字段:Date,Order_id,User_id,amount。请给出sql进行统计:
    数据样例:2017-01-01,10029028,1000003251,33.57
    1) 给出2017年每个月的订单数、用户数、总成交金额
    一个分区中的数据肯定很大,不要用distinct,用group by user_id做一个子查询再count(user_id)
    select count(user_id) from (select user_id from stg.order group by user_id);
    select count(order_id) order_count, count(distinct(user_id)) user_count, sum(amount) all, substring(date, 1, 7) month from stg.order 
    where substring(date, 1, 4)='2017' 
    group by month;
    
    2) 给出2017年11月的新客数(指在11月才有第一笔订单)。
    select count(1) from
    (select order_id, lag(date, 1) over(partition by user_id order by date) fistOrder from stg.order) t1
    where substring(date, 1, 7) = '2017-11' and fistOrder is null;
    蚂蚁森林植物申领统计
    ===================================================================
    表1:user_low_carbon表记录了用户每天的蚂蚁森林低碳生活领取的记录流水
    user_idint)   data_dt(string)     low_carbon
    用户             日期                  减少碳排放(g)
    
    数据样例:
    user_id  |   date_dt   |  low_carbon
    ————————————————————
    u_001    |   2017/1/1  |  10
    u_001    |   2017/1/2  |  150
    u_001    |   2017/1/2  |  110
    u_001    |   2017/1/2  |  10
    u_001    |   2017/1/4  |  50
    u_001    |   2017/1/4  |  10
    u_001    |   2017/1/6  |  45
    u_001    |   2017/1/6  |  90
    u_002    |   2017/1/1  |  10
    u_002    |   2017/1/2  |  150
    u_002    |   2017/1/2  |  70
    u_002    |   2017/1/3  |  30
    u_002    |   2017/1/3  |  80
    u_002    |   2017/1/4  |  150
    u_002    |   2017/1/5  |  101
    u_002    |   2017/1/6  |  68
    
    ================================================================
    表2:plant_carbon表,用于记录申领环保植物所需要减少的碳排放量
      
    plant_id(int)    plant_name    low_carbon
    植物编号            植物名        换购植物所需要的碳
    
    数据样例:
    plant_id  |  plant_name  |  plant_carbon
    ————————————————————
    p001      |  梭梭树      |  17
    p002      |  沙柳        |  19
    p003      |  樟子树      |  146
    p004      |  胡杨        |  215
    ================================================================
    题目一
    蚂蚁森林植物申领统计
    问题:假设2017年1月1日开始记录低碳数据(user_low_carbon),假设2017年10月1日之前满足申领条件的用户都申领了一颗“p004-胡杨”,
    剩余的能量全部用来领取“p002-沙柳”。
    统计在10月1日累计申领“p002-沙柳” 排名前10的用户信息;以及他比后一名多领了几颗沙柳。
    得到的统计结果如下表样式:
    user_id  plant_count less_count(比后一名多领了几颗沙柳)
    u_101    1000         100
    u_088    900          400
    u_103    5001.累计能量排名前10的用户信息,取日期在10月1日之前的|按用户id分组|总能量排序|-过滤前11:
    t1
    select user_id, sum(low_carbon) sum_carbon from user_low_carbon 
    where datediff(regexp_replace("2017/10/1", "/", "-"), regexp_replace(date_dt, "/", "-")) > 0
    group by user_id order by sum_carbon desc limit 11;
    2.胡杨的能量
    t2
    select plant_carbon huyang from plant_carbon where plant_name = "胡杨";
    3.杨柳的能量
    t3
    select plant_carbon yangliu from plant_carbon where plant_name = "杨柳";
    4.能领取的杨柳个数num
    t4
    select floor((sum_carbon-huyang)/yangliu) plant_count from t1, t2, t3;
    
    5.他比后一名的人多领取的数,用lead往后第n行数据,把它们做比较的放在同1行;
    t5
    select lead(sum_carbon, 1, 0) over(sort by sum_carbon desc) plant_count2 from t4;
    
    6.做比较
    select user_id,plant_count, (plant_count - plant_count2) less_count from t5 limit10;
    
    
    =================================================================
    题目二
    蚂蚁森林低碳用户排名分析
    问题:查询user_low_carbon表中每日流水记录,条件为:
    用户在2017年,连续三天(或以上)的天数里,
    每天减少碳排放(low_carbon)都超过100g的用户低碳流水。
    需要查询返回满足以上条件的user_low_carbon表中的记录流水。
    例如用户u_002符合条件的记录如下,因为2017/1/2~2017/1/5连续四天的碳排放量之和都大于等于100g:
    seq(keyuser_id data_dt  low_carbon
    xxxxx10    u_002  2017/1/2  150
    xxxxx11    u_002  2017/1/2  70
    xxxxx12    u_002  2017/1/3  30
    xxxxx13    u_002  2017/1/3  80
    xxxxx14    u_002  2017/1/4  150
    xxxxx14    u_002  2017/1/5  101
    
    
    1.过滤用户在2007年中,碳排放量超过100g能量的用户;
    按在2007年的| 用户id、日期(因为不同行有可能是一个日期)进行分组| 选择每天的能量>100的;
    t1
    select user_id, date_dt, sum(low_carbon) sum_day from user_low_carbon where substring(data_dt, 1, 4) year = "2017"
    group by user_id, data_dt order by user_id, data_dt having sum_day > 100
    
    2.每条数据的日期以及前两条和后两条数据的日期
    t2
    select user_id,
    data_dt, 
    lag(data_dt, 2, "2000/1/1") over(partition by user_id) lag_2,
    lag(data_dt, 1, "2000/1/1") over(partition by user_id) lag_1,
    lead(data_dt, 2, "2000/1/1") over(partition by user_id) lead_2,
    lead(data_dt, 1, "2000/1/1") over(partition by user_id) lead_1
    from t1;
    3.计算当前日期与前后两条数据的日期差
    t3:
    select user_id, data_dt
    datediff(regexp_replace("data_dt", "/", "-"), regexp_replace("lag_2", "/", "-")) lag2,
    datediff(regexp_replace("data_dt", "/", "-"), regexp_replace("lag_1", "/", "-")) lag1,
    datediff(regexp_replace("data_dt", "/", "-"), regexp_replace("lead_2"), "/", "-") lead2,
    datediff(regexp_replace("data_dt", "/", "-"), regexp_replace("lead_1", "/", "-")) lead1
    from t2;
     
    4.连续3天有三种情况:
        ①当前日期和前一天日期差为1,当前日期和前两天的日期差为2;
        ②当前日期和前一天日期差为1,当前日期和后一天的日期差为-1;
        ③当前日期和后一天的日期差为-1,当前日期和后两天的日期差为-2;
    t4:    
    select user_id, data_dt from t3
    where (lag1=1 and lag2=2) or (lag1=1 and lead1=-1) or (lead1=-1 and lead2=-2);
    
    5.最后的结果
    select t5.user_id, t5.data_dt, t5.low_carbon from user_low_carbon t5 
    inner join t4 on t4.user_id = t5.user_id 
    where t4.user_id = t5.user_id and t4.data_dt = t5.date_dt;
    
    
    ======================================================
    注:
    涉及到的hive函数
    ====================================
    1:regexp_replace(arg1,arg2,arg3)
        arg1:被替换字符串的正则表达式
        arg2:被替换的字符
        arg3:被换成的字符
    e.g. :regexp_replace("2017/1/4","/","-")=2017-1-4
    =====================================
    2:datediff(arg1,arg2)
        arg1:日期1
        arg2:日期2
    e.g.:datediff("2017-1-6","2017-1-5")=1
  • 相关阅读:
    pyinstall打包工具
    python中用xlsxwriter创建图表
    python打包工具 cx_Freeze介绍
    JavaScript、Dom和jQuery
    MHA原理
    mysql-mmm实现高可用和部署时须要考虑的问题
    jquery中attr和prop的区别
    FLUSH TABLES WITH READ LOCK 获取锁的速度
    Python Memcached、Redis & RabbitMQ使用
    innodb_flush_log_at_trx_commit
  • 原文地址:https://www.cnblogs.com/shengyang17/p/10404223.html
Copyright © 2011-2022 走看看