zoukankan      html  css  js  c++  java
  • hive笔记

    hive

    -- 不区分大小写
    --建表  if not exists 
    CREATE TABLE if not exists page_view(viewTime INT, userid BIGINT,
                    page_url STRING, referrer_url STRING,
                    friends ARRAY<BIGINT>, properties MAP<STRING, STRING>,
                    ip STRING COMMENT 'IP Address of the User')
    COMMENT 'This is the page view table'
    PARTITIONED BY(dt STRING, country STRING)	--分区
    CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS	--分桶
    ROW FORMAT DELIMITED	--行默认分隔符
            FIELDS TERMINATED BY '1'	--字段分隔符
            COLLECTION ITEMS TERMINATED BY '2'
            MAP KEYS TERMINATED BY '3'
    STORED AS SEQUENCEFILE;	--以二进制格式进行存储数据在hdfs上
    
    
    SHOW TABLES;
    SHOW TABLES 'page.*';
    SHOW PARTITIONS page_view;  --列出表的分区,无分区则抛出错误。
    
    DESCRIBE page_view;  --列出表的列和列的类型
    DESCRIBE EXTENDED page_view;  --列出表的列和表的其他属性。这会打印很多信息,且输出的风格不是很友好,通常用于调试。
    DESCRIBE EXTENDED page_view PARTITION (ds='2016-08-08');   --列出列和分区的所有属性。这也会打印出许多信息,通常也是用于调试。
    
    ALTER TABLE old_table_name RENAME TO new_table_name;  --修改表名
    ALTER TABLE old_table_name REPLACE COLUMNS (col1 TYPE, ...);  --修改列名(列类型需一致),不修改的列需附上,否则丢失
    ALTER TABLE tab1 ADD COLUMNS (c1 INT COMMENT 'a new int column', c2 STRING DEFAULT 'def val');  --增加列
    
    DROP TABLE if exists pv_users;  --删表
    ALTER TABLE pv_users DROP PARTITION (ds='2016-08-08')  --删除分区
    
    --加载数据到Hive表 -> 外部表
    CREATE EXTERNAL TABLE page_view_stg(viewTime INT, userid BIGINT,
                    page_url STRING, referrer_url STRING,
                    ip STRING COMMENT 'IP Address of the User',
                    country STRING COMMENT 'country of origination')
    COMMENT 'This is the staging page view table'
    ROW FORMAT DELIMITED 
    FIELDS TERMINATED BY '44' 	--‘44’是逗号,‘12’是换页符
    LINES TERMINATED BY '12'
    STORED AS TEXTFILE
    LOCATION '/user/data/staging/page_view';
    
    hadoop dfs -put /tmp/pv_2016-06-08.txt /user/data/staging/page_view
    
    FROM page_view_stg pvs
    INSERT OVERWRITE TABLE page_view PARTITION(dt='2016-06-08', country='US')
    SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip
    WHERE pvs.country = 'US';
    
    
    --加载数据到Hive表 -> 直接(格式一致),路径参数可以是一个目录(不含子目录),一个文件,或一个通配符
    LOAD DATA LOCAL INPATH /tmp/pv_2016-06-08_us.txt INTO TABLE page_view PARTITION(date='2016-06-08', country='US')
    
    
    --select
    INSERT OVERWRITE TABLE user_active
    SELECT user.*
    FROM user
    WHERE user.active = 1;
    
    
    INSERT OVERWRITE TABLE xyz_com_page_views
    SELECT page_views.*
    FROM page_views
    WHERE page_views.date >= '2008-03-01' AND page_views.date <= '2008-03-31' AND
          page_views.referrer_url like '%xyz.com';
    
    --join
    INSERT OVERWRITE TABLE pv_users
    SELECT pv.*, u.gender, u.age
    FROM user u JOIN page_view pv 
    ON (pv.userid = u.id)
    WHERE pv.date = '2008-03-03';
    
    
    INSERT OVERWRITE TABLE pv_users
    SELECT pv.*, u.gender, u.age
    FROM user u FULL OUTER JOIN page_view pv 
    ON (pv.userid = u.id)
    WHERE pv.date = '2008-03-03';
    
    
    INSERT OVERWRITE TABLE pv_friends
    SELECT pv.*, u.gender, u.age, f.friends
    FROM page_view pv JOIN user u ON (pv.userid = u.id) JOIN friend_list f ON (u.id = f.uid)
    WHERE pv.date = '2008-03-03';
    
    
    --union all
    INSERT OVERWRITE TABLE actions_users
    SELECT u.id, actions.date
    FROM (
        SELECT av.uid AS uid
        FROM action_video av
        WHERE av.date = '2008-06-03'
    
        UNION ALL
    
        SELECT ac.uid AS uid
        FROM action_comment ac
        WHERE ac.date = '2008-06-03'
    ) actions JOIN users u 
    ON(u.id = actions.uid);
    
    
    --聚合(不可distinct作用于不同的列)
    INSERT OVERWRITE TABLE pv_gender_agg
    SELECT pv_users.gender, count(DISTINCT pv_users.userid), count(*), sum(DISTINCT pv_users.userid)
    FROM pv_users
    GROUP BY pv_users.gender;
    
    
    --多表/文件插入
    FROM pv_users
    INSERT OVERWRITE TABLE pv_gender_sum  --插入hive
        SELECT pv_users.gender, count_distinct(pv_users.userid)
        GROUP BY pv_users.gender
    
    INSERT OVERWRITE DIRECTORY '/user/data/tmp/pv_age_sum'  --插入HDFS文件系统
        SELECT pv_users.age, count_distinct(pv_users.userid)
        GROUP BY pv_users.age;
    
    
    --动态分区插入
    FROM page_view_stg pvs  --不便于维护
    INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country='US')
           SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip WHERE pvs.country = 'US'
    INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country='CA')
           SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip WHERE pvs.country = 'CA'
    INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country='UK')
           SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip WHERE pvs.country = 'UK';
    
    FROM page_view_stg pvs  --多分区插入,目前只允许最后一列放置动态分区列,层级次序
    INSERT OVERWRITE TABLE page_view PARTITION(dt='2008-06-08', country)
           SELECT pvs.viewTime, pvs.userid, pvs.page_url, pvs.referrer_url, null, null, pvs.ip, pvs.country
    
    
    --导出到本地文件
    INSERT OVERWRITE LOCAL DIRECTORY '/tmp/pv_gender_sum'
    SELECT pv_gender_sum.*
    FROM pv_gender_sum;
    
    
    --抽样
    --TABLESAMPLE(BUCKET 3 OUT OF 16) 表示第3个桶和第19个桶
    --TABLESAMPLE(BUCKET 3 OUT OF 64 ON userid) 表示第3个桶的一半
    INSERT OVERWRITE TABLE pv_gender_sum_sample
    SELECT pv_gender_sum.*
    FROM pv_gender_sum TABLESAMPLE(BUCKET 3 OUT OF 32);  --32个桶中选择第三个,必须是CLUSTERED BY修饰的列
    
    
    --表的array、map列操作
    CREATE TABLE array_table (int_array_column ARRAY<INT>);
    SELECT pv.userid, pv.friends[2], size(pv.friends)
    FROM page_view pv;
    
    INSERT OVERWRITE page_views_map
    SELECT pv.userid, pv.properties['page type'], size(pv.properties)
    FROM page_views pv;
    

      

  • 相关阅读:
    和至少为 K 的最短子数组
    使用VS code编写C++无法实时检测代码的解决办法
    anaconda安装VSCODE后,python报错
    神经网络中sigmod函数和tanh函数的区别
    获取本机IP
    windows C++捕获CMD命令输出
    windows下面生成 dump
    windows 控制台命令输出 捕获
    不使用PEM 文件建立SSL通道
    OpenSSL socket 服务端
  • 原文地址:https://www.cnblogs.com/iupoint/p/14022746.html
Copyright © 2011-2022 走看看