zoukankan      html  css  js  c++  java
  • 1.1-1.4 sqoop概述及安装cdh版hadoop

    一、概述

    Sqoop是一个在结构化数据和Hadoop之间进行批量数据迁移的工具,结构化数据可以是Mysql、Oracle等RDBMS。
    Sqoop底层用MapReduce程序实现抽取、转换、加载,MapReduce天生的特性保证了并行化和高容错率,而且
    相比Kettle等传统ETL工具,任务跑在Hadoop集群上,减少了ETL服务器资源的使用情况。在特定场景下,抽取过程会有很大的性能提升。    如果要用Sqoop,必须正确安装并配置Hadoop,因依赖于本地的hadoop环境启动MR程序;mysql、oracle等数据库的JDBC驱动也要放到Sqoop的lib目录下。
    本文针对的是Sqoop1,不涉及到Sqoop2,两者有很大区别;

    import是把数据从RDBMS导入到Hadoop的工具;


    二、安装hadoop

    1、说明

    sqoop与hadoop集成需要编译,比较麻烦,这里直接使用cdh版本的hadoop;

    CDH安装包下载:http://archive.cloudera.com/cdh5/

    使用cdh-5.3.6版本:

    http://archive.cloudera.com/cdh5/cdh/5/

           hadoop-2.5.0-cdh5.3.6.tar.gz
            hive-0.13.1-cdh5.3.6.tar.gz
           zookeeper-3.4.5-cdh5.3.6.tar.gz
           sqoop-1.4.5-cdh5.3.6.tar.gz


    2、准备安装

    #创建安装目录
    [root@hadoop-senior opt]# mkdir /opt/cdh-5.3.6
    
    #上传安装包
    [root@hadoop-senior cdh]# pwd
    /opt/softwares/cdh
    
    [root@hadoop-senior cdh]# ls
     hadoop-2.5.0-cdh5.3.6.tar.gz  hive-0.13.1-cdh5.3.6.tar.gz  sqoop-1.4.5-cdh5.3.6.tar.gz
    
    
    #解压hadoop、hive
    [root@hadoop-senior cdh]# tar zxf hadoop-2.5.0-cdh5.3.6.tar.gz -C /opt/cdh-5.3.6/
    [root@hadoop-senior cdh]# tar zxf hive-0.13.1-cdh5.3.6.tar.gz -C /opt/cdh-5.3.6/
    
    
    #查看
    [root@hadoop-senior cdh-5.3.6]# cd /opt/cdh-5.3.6/
    
    [root@hadoop-senior cdh-5.3.6]# ls
    hadoop-2.5.0-cdh5.3.6  hive-0.13.1-cdh5.3.6
    
    [root@hadoop-senior cdh-5.3.6]# cd hadoop-2.5.0-cdh5.3.6/
    
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# ls
    bin  bin-mapreduce1  cloudera  etc  examples  examples-mapreduce1  include  lib  libexec  sbin  share  src


    3、配置

    java_home:

    ##/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/etc/hadoop/hadoop-env.sh
    export JAVA_HOME=/opt/modules/jdk1.7.0_80
    
    
    ##/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/etc/hadoop/yarn-env.sh
    export JAVA_HOME=/opt/modules/jdk1.7.0_80
    
    
    ##/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/etc/hadoop/mapred-env.sh
    export JAVA_HOME=/opt/modules/jdk1.7.0_80


    core-site.xml

    ##/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/etc/hadoop/core-site.xml
    <configuration>
    
        <property>
            <name>fs.defaultFS</name>
            <value>hdfs://hadoop-senior.ibeifeng.com:8020</value>
        </property>
    
        <property>
            <name>hadoop.tmp.dir</name>
            <value>/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/data/tmp</value>
         </property>
    
    </configuration>
    
    
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# mkdir -pv /opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/data/tmp


    hdfs-site.xml

    ##/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/etc/hadoop/hdfs-site.xml
    <configuration>
        <property>
            <name>dfs.namenode.secondary.http-address</name>
            <value>hadoop-senior.ibeifeng.com:50090</value>
        </property>
        
        <property>
            <name>dfs.namenode.http-address</name>
            <value>hadoop-senior.ibeifeng.com:50070</value>
        </property>
        
        <property>
            <name>dfs.replication</name>
            <value>1</value>
        </property>
    
        <property>
            <name>dfs.permissions</name>
            <value>false</value>
        </property>
    
    </configuration>


    slaves文件

    ##/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/etc/hadoop/slaves
    hadoop-senior.ibeifeng.com


    yarn-site.xml

    ##/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/etc/hadoop/yarn-site.xml
    <configuration>
    
        <property>
            <name>yarn.nodemanager.aux-services</name>
            <value>mapreduce_shuffle</value>
        </property>
    
        <property>
            <name>yarn.resourcemanager.hostname</name>
            <value>hadoop-senior.ibeifeng.com</value>
        </property>
    
        <property>
            <name>yarn.nodemanager.resource.cpu-vcores</name>
            <value>4</value>
        </property>
        
        <property>
            <name>yarn.log-aggregation-enable</name>
            <value>true</value>
        </property>
        
        <property>
            <name>yarn.log-aggregation.retain-seconds</name>
            <value>604800</value>
        </property>
    
    </configuration>


    mapred-site.xml

    ##/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6/etc/hadoop/mapred-site.xml         //先重命名
    <configuration>
    
        <property>
            <name>mapreduce.framework.name</name>
            <value>yarn</value>
        </property>
    
        <property>
            <name>mapreduce.jobhistory.address</name>
            <value>hadoop-senior.ibeifeng.com:10020</value>
        </property>
    
        <property>
            <name>mapreduce.jobhistory.webapp.address</name>
            <value>hadoop-senior.ibeifeng.com:19888</value>
        </property>
    
    </configuration>


    3、启动

    #格式化文件系统
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# bin/hdfs namenode -format
    
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# rm -rf /tmp/*
    
    
    #启动hdfs
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# sbin/hadoop-daemon.sh start namenode
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# sbin/hadoop-daemon.sh start datanode
    
    
    #启动yarn
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# sbin/yarn-daemon.sh start resourcemanager
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# sbin/yarn-daemon.sh start nodemanager
    
    
    #启动historyserver
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# sbin/mr-jobhistory-daemon.sh start historyserver
    
    
    
    #查看
    [root@hadoop-senior ~]# jps
    3165 Jps
    3119 JobHistoryServer
    2683 ResourceManager
    2573 DataNode
    2471 NameNode
    2960 NodeManager


    三、安装hive

    1、/opt/cdh-5.3.6/hive-0.13.1-cdh5.3.6/conf/hive-env.sh           //先重命名

    HADOOP_HOME=/opt/cdh-5.3.6/hadoop-2.5.0-cdh5.3.6
    
    export HIVE_CONF_DIR=/opt/cdh-5.3.6/hive-0.13.1-cdh5.3.6/conf


    2、/opt/cdh-5.3.6/hive-0.13.1-cdh5.3.6/conf/hive-log4j.properties     //先重命名

    hive.log.dir=/opt/cdh-5.3.6/hive-0.13.1-cdh5.3.6/logs


    3、创建配置文件

    [root@hadoop-senior ~]# cd /opt/cdh-5.3.6/hive-0.13.1-cdh5.3.6/conf/
    
    [root@hadoop-senior conf]# touch hive-site.xml
    
    ################hive-site.xml#######################
    <?xml version="1.0"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
    <configuration>
    
        <property>
            <name>javax.jdo.option.ConnectionURL</name>
            <value>jdbc:mysql://hadoop-senior.ibeifeng.com:3306/metadata?createDatabaseIfNotExist=true</value>
            <description>JDBC connect string for a JDBC metastore</description>
        </property>
    
        <property>
            <name>javax.jdo.option.ConnectionDriverName</name>
            <value>com.mysql.jdbc.Driver</value>
            <description>Driver class name for a JDBC metastore</description>
        </property>
    
        <property>
            <name>javax.jdo.option.ConnectionUserName</name>
            <value>root</value>
            <description>username to use against metastore database</description>
        </property>
    
        <property>
            <name>javax.jdo.option.ConnectionPassword</name>
            <value>123456</value>
            <description>password to use against metastore database</description>
        </property>
        
        <property>
            <name>hive.cli.print.header</name>
            <value>true</value>
            <description>Whether to print the names of the columns in query output.</description>
        </property>
    
        <property>
            <name>hive.cli.print.current.db</name>
            <value>true</value>
            <description>Whether to include the current database in the Hive prompt.</description>
        </property>
        
        <property>
            <name>hive.fetch.task.conversion</name>
            <value>more</value>
            </property>
    
    </configuration>


    4、拷贝mysql驱动文件

    [root@hadoop-senior hive-0.13.1-cdh5.3.6]# cp /opt/modules/hive-0.13.1/lib/mysql-connector-java-5.1.27-bin.jar ./lib/


    5、连接hive

    ##连接测试
    [root@hadoop-senior hive-0.13.1-cdh5.3.6]# bin/hive
    Logging initialized using configuration in file:/opt/cdh-5.3.6/hive-0.13.1-cdh5.3.6/conf/hive-log4j.properties
    hive (default)>
    
    
    ##创建hive的数据存储目录
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# bin/hdfs dfs -mkdir -p /user/hive/warehouse
    
    [root@hadoop-senior hadoop-2.5.0-cdh5.3.6]# bin/hdfs dfs -chmod g+w /user/hive/warehouse
    
    
    ##创建一张测试表
    hive (default)> create table student(id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '	';
    OK
    Time taken: 0.708 seconds
    
    hive (default)> load data local inpath '/opt/datas/student.txt' overwrite into table student;
    Loading data to table default.student
    Table default.student stats: [numFiles=1, numRows=0, totalSize=36, rawDataSize=0]
    OK
    Time taken: 0.885 seconds
    
    hive (default)> select * from student;
    OK
    student.id    student.name
    1001    zhangsan
    1002    lisi
    1003    wangwu
    Time taken: 0.218 seconds, Fetched: 3 row(s)

    此时web页面应该也可以打开:ip:8088     ip:50070

  • 相关阅读:
    Sbt 配置国内镜像
    spark sql 之 RDD与DataFrame互相转化
    Base64编码和解码工具
    BDB数据库操作工具化
    String、StringBuffer、StringBuilder三者的区别
    常见的数组排序方法
    drop,delete,truncate 三者的异同点
    redis 常用命令
    git shell应用
    机器学习入门之路
  • 原文地址:https://www.cnblogs.com/weiyiming007/p/10819750.html
Copyright © 2011-2022 走看看