zoukankan      html  css  js  c++  java
  • sqoop&Hive实例

    案例:
    1. 把原始log数据加载到表beifenglog中;
    2. 建立子表beifenglog_hour_visit,存贮常用字段,并从原表中提取数据到子表中;
    3. 提取原表数据过程中,使用UDF处理字段信息,存储到子表中;
     
     
    1. 把原始log数据加载到表beifenglog中;

    -- Raw access-log table. Each input line is split into the eleven string columns
    -- below by the capture groups of input.regex (one group per column, in order).
    -- Backslashes are doubled because the Hive string literal consumes one level of
    -- escaping before the pattern reaches the regex engine. With a single backslash
    -- the digit class degrades to the letter d and no row matches.
    -- Most groups keep their surrounding double quotes as part of the captured value.
    create table beifenglog(
        remote_addr string,
        remote_user string,
        time_local string,
        request string,
        status string,
        body_bytes_sent string,
        request_body string,
        http_referer string,
        http_user_agent string,
        http_x_forwarded_for string,
        host string)
    row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
    with serdeproperties(
        "input.regex" = "(\"[\\d\\.]+\") (\"[^ ]+\") (\".*?\") (\".*?\") (\"\\d+\") (\"\\d+\") ([^ ]+) (\"[^ ]+\") (\".*?\") (\"[^ ]+\") (\"[^ ]+\")"
    )
    stored as textfile;

    加载原表数据
    -- Load the local log file into beifenglog, replacing any rows already in the table.
    load data local inpath '/opt/test/beifenglog.data'
    overwrite into table beifenglog;
     
    2. 建立子表beifenglog_hour_visit,存贮常用字段,并从原表中提取数据到子表中;

    -- Sub-table holding the four frequently used columns of beifenglog, stored as ORC.
    -- The ORC compression property is orc.compress and its value must be upper case
    -- (NONE, ZLIB or SNAPPY) because it is resolved against the CompressionKind enum.
    -- The key orc.compression is not an ORC property, so using it leaves the table on
    -- the default codec instead of Snappy.
    -- NOTE(review): the delimited row format is kept from the original but presumably
    -- has no effect on an ORC table - confirm and drop if so.
    create table beifenglog_hour_visit(
        remote_addr string,
        time_local string,
        request string,
        http_referer string)
    row format delimited fields terminated by '\t'
    stored as orc tblproperties ("orc.compress"="SNAPPY");
     
    -- Replace the contents of the sub-table with the four selected columns
    -- taken unchanged from the raw log table.
    insert overwrite table beifenglog_hour_visit
    select
        remote_addr,
        time_local,
        request,
        http_referer
    from beifenglog;

     
    报错1:
    Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.ClassNotFoundException: Class org.apache.hadoop.hive.contrib.serde2.RegexSerDe not found
            at org.apache.hadoop.hive.ql.exec.MapOperator.getConvertedOI(MapOperator.java:334)
            at org.apache.hadoop.hive.ql.exec.MapOperator.setChildren(MapOperator.java:352)
            at org.apache.hadoop.hive.ql.exec.mr.ExecMapper.configure(ExecMapper.java:126)
     
    解决:在 hive-site.xml 中添加如下配置,把 hive-contrib 的 jar 加入 hive.aux.jars.path,使 RegexSerDe 类可以被加载:

    <!-- Adds the hive-contrib jar to hive.aux.jars.path. That jar provides the
         org.apache.hadoop.hive.contrib.serde2.RegexSerDe class reported as missing
         in the ClassNotFoundException above. -->
    <property>
        <name>hive.aux.jars.path</name>
        <value>file:///opt/modules/cdh/hive-0.13.1-cdh5.3.6/lib/hive-contrib-0.13.1-cdh5.3.6.jar</value>
        <description>Added by tiger.zeng on 20120202.These JAR file are available to all users for all jobs</description>
    </property> 

     
    报错2:
    Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.IllegalArgumentException: No enum constant org.apache.hadoop.hive.ql.io.orc.CompressionKind.snappy
            at org.apache.hadoop.hive.ql.exec.FileSinkOperator.createBucketFiles(FileSinkOperator.java:469)
            at org.apache.hadoop.hive.ql.exec.FileSinkOperator.processOp(FileSinkOperator.java:550)
            at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
            at org.apache.hadoop.hive.ql.exec.SelectOperator.processOp(SelectOperator.java:84)
            at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
            at org.apache.hadoop.hive.ql.exec.TableScanOperator.processOp(TableScanOperator.java:92)
            at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:796)
            at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:549)
            ... 9 more
            
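    解决:重新创建子表。ORC 压缩属性的键是 orc.compress,取值必须大写(对应枚举 CompressionKind:NONE / ZLIB / SNAPPY):
    ("orc.compress"="snappy") --> ("orc.compress"="SNAPPY")
    注意:把键改成 "orc.compression" 虽然不再报错,但该键不会被 ORC 识别,表实际使用的是默认压缩(ZLIB),并没有启用 snappy。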
     
     
    3. 提取原表数据过程中,使用UDF处理字段信息,存储到子表中;

    add jar /opt/test/hive.jar;
    -- The jar added above supplies the two UDF classes registered here
    -- as temporary functions.
    create temporary function removequote as 'org.gh.hadoop.hive.test.RemoveQuote';
    create temporary function formatdate as 'org.gh.hadoop.hive.test.FormatDate';

    -- Rebuild the sub-table, passing every column through removequote and
    -- additionally passing time_local through formatdate.
    insert overwrite table beifenglog_hour_visit
    select
        removequote(remote_addr),
        formatdate(removequote(time_local)),
        removequote(request),
        removequote(http_referer)
    from beifenglog;

    FormatDate.java内容如下:

    package org.gh.hadoop.hive.test;

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    import java.util.Locale;
    import org.apache.hadoop.hive.ql.exec.UDF;

    /**
     * Hive UDF that rewrites a timestamp from the access-log layout
     * {@code dd/MMM/yyyy:HH:mm:ss} (English month abbreviations) into
     * {@code yyyy-MM-dd HH:mm:ss}.
     *
     * <p>The package declaration matches the class name used when the function is
     * registered ({@code org.gh.hadoop.hive.test.FormatDate}).
     */
    public class FormatDate extends UDF{
        // Month names in the input are English abbreviations, so the locale is pinned
        // instead of relying on the JVM default.
        private final SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        private final SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        /**
         * Reformats one timestamp string.
         *
         * @param dateStr timestamp beginning with {@code dd/MMM/yyyy:HH:mm:ss}; text after
         *                the seconds is not consumed by the parser; may be null
         * @return the timestamp as {@code yyyy-MM-dd HH:mm:ss}, or null when
         *         {@code dateStr} is null or cannot be parsed
         */
        public String evaluate(String dateStr){
            if(dateStr == null){
                return null;
            }

            try{
                Date parsed = inputFormat.parse(dateStr);
                return outputFormat.format(parsed);
            }catch(ParseException e){
                // Best effort: a malformed value yields null for this row rather than
                // failing the whole query.
                e.printStackTrace();
                return null;
            }
        }
    } 

     
    RemoveQuote.java内容如下:

    package org.gh.hadoop.hive.test;
    import org.apache.hadoop.hive.ql.exec.UDF;
    public class RemoveQuote extends UDF{
        public String evaluate(String str){
            if(str == null){
                return null;
            }
            return str.replaceAll(""", "");
        }
    } 
  • 相关阅读:
    [MCM] PSO粒子群算法解决TSP问题
    [MCM] 2017研究生数学建模竞赛A题 3架飞机 TSP 求总路径最小
    [tool] AI视频翻译 解决英文视频字幕问题(类似youtube自动生成字幕)
    使用vsnprintf后链接错误及解决方法
    Linux Shell编程
    如何把va_list可变参数传送到下一级函数中(如传送到printf)
    ubuntu phone/touch的源码从哪里下载?
    The Native POSIX Thread Library for Linux
    Linux内核头文件与内核与库的关系
    Buildroot阅读笔记
  • 原文地址:https://www.cnblogs.com/guanhao/p/5624065.html
Copyright © 2011-2022 走看看