  • Hadoop 3 pseudo-distributed installation

    I. Installing Hadoop

    1. Pseudo-distributed mode

    The NameNode, DataNode, ResourceManager, NodeManager and other daemons are all present, but they run on the same single server.

    2. Passwordless SSH

    Generate a key pair (just press Enter at every prompt): ssh-keygen -t rsa

    Append the public key to the local authorized keys: cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys

    Verify: ssh <local IP> should log in without asking for a password.
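    Putting it together, a minimal sketch assuming the host IP 192.168.0.143 used throughout this article (the chmod is only needed if authorized_keys was created with loose permissions):

    ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa        # non-interactive key generation

    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys

    chmod 600 ~/.ssh/authorized_keys                # sshd refuses keys with overly open permissions

    ssh 192.168.0.143 hostname                      # should run without prompting for a password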

    3. Disable the firewall

    systemctl stop firewalld.service # stop firewalld

    systemctl disable firewalld.service # prevent firewalld from starting at boot
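    An optional check, assuming a systemd-based distribution such as CentOS 7+:

    systemctl status firewalld.service              # should report "inactive (dead)"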

    4. Install and configure the JDK

    The installation procedure itself is easy to find online.

    Verify: java -version
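    For reference only, a rough sketch of a tarball install that ends up at the /ecapp/jdk path used later in this article; the archive and directory names below are placeholders for whatever JDK build you actually downloaded:

    tar xvf jdk-8uXXX-linux-x64.tar.gz -C /ecapp    # hypothetical archive name

    mv /ecapp/jdk1.8.0_XXX /ecapp/jdk               # rename so that JAVA_HOME=/ecapp/jdk works

    /ecapp/jdk/bin/java -version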

    5. Hadoop setup

    Extract the distribution:

    Command: tar xvf hadoop-3.1.2.tar -C /ecapp && mv /ecapp/hadoop-3.1.2 /ecapp/hadoop
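    To confirm the unpack worked, the bundled script can print its version (paths as above):

    /ecapp/hadoop/bin/hadoop version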

    II. Configuration files

    1. core-site.xml

    <?xml version="1.0" encoding="UTF-8"?>

    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

    <configuration>

      <!-- Default filesystem URI, i.e. the address of the HDFS NameNode -->

      <property>

        <name>fs.defaultFS</name>

        <value>hdfs://192.168.0.143:9000</value>   

      </property>

      <!-- Base directory for files Hadoop generates at runtime -->

      <property>

        <name>hadoop.tmp.dir</name>

        <value>/data/hadoop/hdfs/meta</value>   

      </property>    

    </configuration>

    mkdir -p /data/hadoop/hdfs/meta
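    Once core-site.xml is in place, the setting can be read back as a quick sanity check (this assumes the PATH changes from section III are already applied):

    hdfs getconf -confKey fs.defaultFS              # should print hdfs://192.168.0.143:9000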

    2. hdfs-site.xml

    The DataNode can be given multiple storage directories; each directory holds different blocks, which behaves somewhat like having multiple DataNodes.

    <?xml version="1.0" encoding="UTF-8"?>

    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

    <configuration>

      <!-- HDFS replication factor -->

      <property>

        <name>dfs.replication</name>

        <value>1</value>

      </property>

      <property>

        <name>dfs.namenode.name.dir</name>

        <value>/data/hadoop/hdfs/namenode</value>

      </property>

      <property>

        <name>dfs.datanode.data.dir</name>

        <value>/data/hadoop/hdfs/datanode1,/data/hadoop/hdfs/datanode2</value>

      </property>

    </configuration>
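    Hadoop will normally create these storage directories on first use, but they can also be created up front with the exact paths configured above:

    mkdir -p /data/hadoop/hdfs/namenode /data/hadoop/hdfs/datanode1 /data/hadoop/hdfs/datanode2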

    3. yarn-site.xml

    <?xml version="1.0"?>

    <configuration>

        <!-- Hostname/address of the YARN ResourceManager -->

        <property>

            <name>yarn.resourcemanager.hostname</name>

            <value>192.168.0.143</value>

        </property>

        <!-- Auxiliary shuffle services used by reducers; both MapReduce and Spark are configured here. If Spark is not installed, remove the spark_shuffle entry for now. -->

        <property>

            <name>yarn.nodemanager.aux-services</name>

            <value>mapreduce_shuffle,spark_shuffle</value>

        </property>

        <!-- Class implementing the Spark shuffle service -->

        <property>

           <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>

           <value>org.apache.spark.network.yarn.YarnShuffleService</value>

        </property>

        <!-- Total memory the NodeManager may hand out to containers; this server has 8 GB, so roughly 6 GB is configured here -->

        <property>

            <description>Amount of physical memory, in MB, that can be allocated for containers.</description>

            <name>yarn.nodemanager.resource.memory-mb</name>

            <value>6000</value>

        </property>

        <!-- Minimum memory allocation per container -->

        <property>

            <description>The minimum allocation for every container request at the RM,

                         in MBs. Memory requests lower than this won't take effect,

                         and the specified value will get allocated at minimum.</description>

            <name>yarn.scheduler.minimum-allocation-mb</name>

            <value>512</value>

        </property>

        <!-- Maximum memory allocation per container -->

        <property>

            <description>The maximum allocation for every container request at the RM,

                         in MBs. Memory requests higher than this won't take effect,

                         and will get capped to this value.</description>

            <name>yarn.scheduler.maximum-allocation-mb</name>

            <value>6000</value>

        </property>

     

        <!-- Log aggregation, needed for the history server enabled later -->

        <property>

         <name>yarn.log-aggregation-enable</name>

         <value>true</value>

        </property>

        <property>

         <name>yarn.log-aggregation.retain-seconds</name>

         <value>2592000</value>

        </property>

        <!-- Web address of the job history server -->

        <property>

         <name>yarn.log.server.url</name>

         <value>http://192.168.0.143:8988/jobhistory/logs</value>

        </property>

        <!-- HDFS location where aggregated logs are stored -->

        <property>

         <name>yarn.nodemanager.remote-app-log-dir</name>

         <value>hdfs://192.168.0.143:9000/user/root/yarn-logs/</value>

        </property>

        <!-- CPU cores available to containers; this machine has 4. Do not set this higher; a value slightly below the physical core count is recommended. -->

      <property>

        <description>Number of vcores that can be allocated

        for containers. This is used by the RM scheduler when allocating

        resources for containers. This is not used to limit the number of

        CPUs used by YARN containers. If it is set to -1 and

        yarn.nodemanager.resource.detect-hardware-capabilities is true, it is

        automatically determined from the hardware in case of Windows and Linux.

        In other cases, number of vcores is 8 by default.</description>

        <name>yarn.nodemanager.resource.cpu-vcores</name>

        <value>4</value>

      </property>

        <!-- cpu最小分配 -->

      <property>

        <description>The minimum allocation for every container request at the RM

        in terms of virtual CPU cores. Requests lower than this will be set to the

        value of this property. Additionally, a node manager that is configured to

        have fewer virtual cores than this value will be shut down by the resource

        manager.</description>

        <name>yarn.scheduler.minimum-allocation-vcores</name>

        <value>1</value>

      </property>

        <!-- cpu最大分配 -->

      <property>

        <description>The maximum allocation for every container request at the RM

        in terms of virtual CPU cores. Requests higher than this will throw an

        InvalidResourceRequestException.</description>

        <name>yarn.scheduler.maximum-allocation-vcores</name>

        <value>3</value>

      </property>

        <!-- Classpath for YARN applications -->

      <property>

        <name>yarn.application.classpath</name>

     <value>/ecapp/hadoop/etc/hadoop:/ecapp/hadoop/share/hadoop/common/lib/*:/ecapp/hadoop/share/hadoop/common/*:/ecapp/hadoop/share/hadoop/hdfs:/ecapp/hadoop/share/hadoop/hdfs/lib/*:/ecapp/hadoop/share/hadoop/hdfs/*:/ecapp/hadoop/share/hadoop/mapreduce/lib/*:/ecapp/hadoop/share/hadoop/mapreduce/*:/ecapp/hadoop/share/hadoop/yarn:/ecapp/hadoop/share/hadoop/yarn/lib/*:/ecapp/hadoop/share/hadoop/yarn/*</value>

      </property>

    </configuration>
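    The long yarn.application.classpath value above can be regenerated for your own layout with the hadoop classpath command (assuming HADOOP_HOME points at /ecapp/hadoop as set up in section III). Also note that the spark_shuffle service only works if Spark's YARN shuffle jar has been placed on the NodeManager classpath; otherwise remove the spark_shuffle entries as mentioned in the comment above.

    hadoop classpath                                # prints the colon-separated classpath to paste into the value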

    4. mapred-site.xml

    <?xml version="1.0"?>

    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

    <configuration>

    <!-- Run MapReduce on YARN -->

    <property>

      <name>mapreduce.framework.name</name>

      <value>yarn</value>

    </property>

    <!-- Job history server; finished MapReduce jobs are published to it -->

    <property>

        <name>mapreduce.jobhistory.address</name>

        <value>192.168.0.143:10020</value>

    </property>

    <property>

        <name>mapreduce.jobhistory.webapp.address</name>

        <value>192.168.0.143:8988</value>

    </property>

    <!-- Number of jobs the history server keeps cached -->

    <property>

        <name>mapreduce.jobhistory.joblist.cache.size</name>

        <value>5000</value>

    </property>

    </configuration>
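    Once the history server is running (section IV), the web app address configured here can be probed quickly (curl assumed to be installed):

    curl -sI http://192.168.0.143:8988/jobhistory   # should return an HTTP response from the JobHistory web app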

    5. workers

    localhost   // this pseudo-distributed setup lists only the local machine

    6. hadoop-env.sh

    Add your JAVA_HOME at the end of the file:

    export JAVA_HOME=/ecapp/jdk
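    For example, appended non-interactively (install path as configured above):

    echo 'export JAVA_HOME=/ecapp/jdk' >> /ecapp/hadoop/etc/hadoop/hadoop-env.sh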

    III. Linux environment configuration

    1. /etc/profile

    export JAVA_HOME=/ecapp/jdk

    export CLASSPATH=.:$JAVA_HOME/jre/lib/rt.jar:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar

    # Hadoop-related settings

    export HADOOP_HOME=/ecapp/hadoop

    #export HADOOP_OPTS="-Djava.library.path=$HADOOP_PREFIX/lib:$HADOOP_PREFIX/lib/native"

    export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native

    export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native

    export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

    #export HADOOP_ROOT_LOGGER=DEBUG,console

    export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/sbin:$HADOOP_HOME/bin:$JAVA_HOME/bin

    # Hadoop 3.x must define the following 5 variables when the daemons run as root, otherwise startup fails; Hadoop 2.x apparently does not need them

    export HDFS_NAMENODE_USER=root

    export HDFS_DATANODE_USER=root

    export HDFS_SECONDARYNAMENODE_USER=root

    export YARN_RESOURCEMANAGER_USER=root

    export YARN_NODEMANAGER_USER=root

    2. Apply the environment: source /etc/profile
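    A quick check that the environment took effect:

    echo $HADOOP_HOME                               # /ecapp/hadoop

    hadoop version                                  # should report Hadoop 3.1.2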

    IV. Starting the services

    1. Format the NameNode

    Command: hdfs namenode -format

    If no errors appear during the run and the output ends with a message like the following, the format succeeded:

    ...

    /************************************************************

    SHUTDOWN_MSG: Shutting down NameNode at ecs-6531-0002

    ************************************************************/

    If Hadoop has been run after a previous format and you want to format the NameNode again, delete the VERSION files left by the earlier run first; otherwise the NameNode and DataNode cluster IDs will no longer match and startup will fail.
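    A blunt way to reset before re-formatting, assuming the storage directories configured earlier in this article (this wipes all HDFS data):

    stop-all.sh

    rm -rf /data/hadoop/hdfs/namenode/* /data/hadoop/hdfs/datanode1/* /data/hadoop/hdfs/datanode2/* /data/hadoop/hdfs/meta/*

    hdfs namenode -format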

    2. Start the cluster

    start-all.sh starts all the services; the startup logs are written under the logs directory of the Hadoop installation.
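    start-all.sh is a deprecated wrapper in Hadoop 3; HDFS and YARN can also be started separately, which makes troubleshooting easier:

    start-dfs.sh                                    # NameNode, DataNode, SecondaryNameNode

    start-yarn.sh                                   # ResourceManager, NodeManager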

    3. Check the daemons with jps

    6662 Jps

    9273 DataNode            # HDFS worker node

    5465 SecondaryNameNode   # HDFS checkpoint node

    9144 NameNode            # HDFS master node

    9900 NodeManager         # YARN worker node

    9575 ResourceManager     # YARN master node

    4. Start the job history server

    Command: mapred --daemon start historyserver

    jps should now also show:

    12710 JobHistoryServer
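    At this point an end-to-end smoke test can be run with the examples jar bundled in the distribution (jar name assumed to match the 3.1.2 release installed above):

    hadoop jar /ecapp/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.2.jar pi 2 10

    hdfs dfs -ls /                                  # HDFS should also be browsable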

    V. Web UIs

    HDFS: http://192.168.0.143:9870/dfshealth.html#tab-overview
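    With the defaults left untouched, the YARN ResourceManager UI should also be reachable at http://192.168.0.143:8088/ (default port 8088), and the JobHistory UI at the address configured in mapred-site.xml, http://192.168.0.143:8988/jobhistory.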

     
