zoukankan      html  css  js  c++  java
  • 搭建一套高可用的hadoop集群

    hadoop部署

    • 准备好三台机器,规划配置如下
    linux01 10.0.0.155   NameNode DataNode NodeManager
    linux02 10.0.0.156   SecondaryNameNode  DataNode  NodeManager ResourceManager
    linux04 10.0.0.161   DataNode  NodeManager
    

    1.java安装

    # 我用的是:jdk1.8.0_281.tar.gz
    # 直接在 /usr/local/java/ 下解压
    [linux01 /usr/local/java ]# tar zxvf jdk1.8.0_281.tar.gz
    
    • 添加环境变量
    vi /etc/profile
    export JAVA_HOME=/usr/local/java/jdk1.8.0_281/
    export JRE_HOME=${JAVA_HOME}/jre
    export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
    PATH=${JAVA_HOME}/bin:$PATH
    export PATH
    

    2.hosts配置

    • 先给三台主机改下名字
    hostnamectl set-hostname  <名字>
    bash # 刷新
    
    • 三台主机配置hosts 映射
    vi /etc/hosts
    10.0.0.156 linux02
    10.0.0.155 linux01
    10.0.0.161 linux04
    

    3.免密登录配置:

    # 比如你在linux01机器上。输入如下命令,然后一路小回车即可
    [linux01 ~]# ssh-keygen -t rsa
    # 然后ssh-copy-id把本地的ssh公钥文件安装到远程主机.将公钥分配给linux02, linux04
    [linux01 ~]# ssh-copy-id linux02
    [linux01 ~]# ssh-copy-id linux04
    # 更改权限:
    [linux01 ~]# chmod 700 .ssh/
    [linux01 ~]# chmod 600 .ssh/authorized_keys
    # 还需要的是将公钥放到authorized_keys内,否则后面start-dfs.sh还会让你输入密码。链接原理:https://blog.csdn.net/xinneya/article/details/102767327
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
    

    4.下载hadoop

    [linux01 ~]# tar zxvf hadoop-2.7.1.tar.gz
    
    • 改个名字
    [linux01 ~]# mv hadoop-2.7.1 hadoop  
    

    5.配置文件修改

    • hadoop安装目录在/root/hadoop,我们进入到etc/hadoop,也就是/root/hadoop/etc/hadoop
    • vim hdfs-site.xml 提前创建好name目录
    <configuration>
        <property>
            <name>dfs.replication</name>
            <value>3</value>
        </property>
        <property>
            <name>dfs.namenode.rpc-address</name>
            <value>linux01:9000</value>
            <description></description>
        </property>
        <property>
            <name>dfs.namenode.name.dir</name>
            <value>/root/hadoop/name</value>
            <description></description>
        </property>
        <property>
            <name>dfs.namenode.secondary.http-address</name>
            <value>linux02:50090</value>
            <description></description>
        </property>
    </configuration>
    # dfs.namenode.name.dir # namenode
    # dfs.replication  # 文件副本数量
    # dfs.namenode.secondary.http-address   SecondaryNameNode
    
    • hadoop-env.sh
    修改为:export JAVA_HOME=/usr/local/java/jdk1.8.0_281/
    
    • core-site.xml 提前创建好 tmp目录
    <configuration>
            <property>
                <name>fs.defaultFS</name>
                <value>hdfs://linux01:9000</value>
            </property>
            <property>
                <name>hadoop.tmp.dir</name>
                <value>/root/hadoop/tmp</value>
            </property>
            <property>
                <name>dfs.namenode.name.dir</name>
                <value>file://${hadoop.tmp.dir}/dfs/name</value>
            </property>
            <property>
                <name>dfs.datanode.data.dir</name>
                <value>file://${hadoop.tmp.dir}/dfs/data</value>
            </property>
    </configuration>
    
    • slaves 文件中添加datanode节点
    linux02
    linux04
    linux01
    
    • yarn-site.xml
    <configuration>
      <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
      </property>
      <property>
        <name>yarn.resourcemanager.hostname</name>    
        <value>linux02</value>                          
      </property>  
      <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
      </property>
      <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>106800</value>
      </property>
    </configuration>
    
    • mapred-site.xml
    <configuration>
            <property>
                <name>mapreduce.framework.name</name>
                <value>yarn</value>
            </property>
            <property>
                <name>mapreduce.jobhistory.address</name>
                <value>linux01:10020</value>
            </property>
            <property>
                <name>mapreduce.jobhistory.webapp.address</name>
                <value>linux01:19888</value>
            </property>
    </configuration>
    

    6.分发配置文件

    • 将配置好的文件分发给其他节点
    scp -r hadoop/ linux02:$PWD
    scp -r hadoop/ linux04:$PWD
    
    • 配置hadoop和yarn环境变量
    export HADOOP_HOME=/root/hadoop/
    export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
    PATH=${JAVA_HOME}/bin:$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
    export PATH
    
    • linux01节点(也就是master节点) format
    hdfs namenode -format
    

    7.启动服务

    • 启动dfs服务
    start-dfs.sh
    
    • 启动yarn
    start-yarn.sh
    
    • 启动yarn的web管理界面,这里在linux02 启动了ResourceManager
    yarn-daemon.sh start resourcemanager
    

    8.简单测试

    • 上传文件
    hdfs dfs -put t1.txt /
    

    9.hadoop完全分布式配置

    1.Namenode HA 搭建

    • hdfs-site.xml 修改
    <configuration>
        <!-- 完全分布式集群名称 -->
        <property>
            <name>dfs.nameservices</name>
            <value>mycluster</value>
        </property>
        <!-- 集群中NameNode节点名称 -->
        <property>
            <name>dfs.ha.namenodes.mycluster</name>
            <value>nn1,nn2</value>
        </property>
        <!-- nn1的RPC通信地址 -->
        <property>
          <name>dfs.namenode.rpc-address.mycluster.nn1</name>
          <value>linux01:8020</value>
        </property>
        <!-- nn2的RPC通信地址 -->
        <property>
          <name>dfs.namenode.rpc-address.mycluster.nn2</name>
          <value>linux02:8020</value>
        </property>
        <!-- nn1的http通信地址 -->
        <property>
            <name>dfs.namenode.http-address.mycluster.nn1</name>
            <value>linux01:50070</value>
        </property>
    
        <!-- nn2的http通信地址 -->
        <property>
              <name>dfs.namenode.http-address.mycluster.nn2</name>
              <value>linux02:50070</value>
        </property>
        <!-- 指定NameNode元数据在JournalNode上的存放位置 -->
        <property>
              <name>dfs.namenode.shared.edits.dir</name>
                <value>qjournal://linux01:8485;linux02:8485;linux04:8485/mycluster</value>
        </property>
        <!-- 客户端访问代理类:client 通过它确定 mycluster 中哪个 NameNode 处于 active 状态,实现失败自动切换 -->
        <property>
              <name>dfs.client.failover.proxy.provider.mycluster</name>
              <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
        </property>
        <!-- 配置隔离机制,即同一时刻只能有一台服务器对外响应 -->
        <property>
              <name>dfs.ha.fencing.methods</name>
              <value>sshfence</value>
        </property>
        <!-- 使用隔离机制时需要ssh无秘钥登录-->
        <property>
              <name>dfs.ha.fencing.ssh.private-key-files</name>
              <value>/root/.ssh/id_rsa</value>
        </property>
        <!-- 声明journalnode服务器存储目录-->
        <property>
              <name>dfs.journalnode.edits.dir</name>
              <value>/root/hadoop/jn</value>
        </property>
        <!-- 关闭权限检查-->
        <property>
            <name>dfs.permissions.enabled</name>
            <value>false</value>
        </property>
        <!--开启Automatic Failover模式 -->
        <property>
            <name>dfs.ha.automatic-failover.enabled</name>
            <value>true</value>
        </property>
    </configuration>
    
    • core-site.xml
    <configuration>
            <property>
                    <name>fs.defaultFS</name>
                    <value>hdfs://mycluster</value>
            </property>
            <property>
                      <name>hadoop.tmp.dir</name>
                      <value>/root/hadoop/tmp</value>
            </property>
            <!-- 回收站中文件的保留时间,单位为分钟 -->
            <property>
                     <name>fs.trash.interval</name>
                     <value>2</value>
            </property>
            <!-- 回收站检查点的创建间隔,单位为分钟,应小于或等于 fs.trash.interval -->
            <property>
                    <name>fs.trash.checkpoint.interval</name>
                    <value>1</value>
            </property>
            <property>
                    <name>hadoop.http.staticuser.user</name>
                    <value>root</value>
            </property>
            <property>
                    <name>ha.zookeeper.quorum</name>
                    <value>linux01:2181,linux02:2181,linux04:2181</value>
             </property>
    </configuration>
    
    • 配置好内容发送给linux02和linux04
    scp -r hadoop/ linux04:$PWD
    scp -r hadoop/ linux02:$PWD
    
    • 三台设备配置好zookeeper环境变量
    export ZOOKEEPEERPATH=/opt/zookeeper-3.4.6
    ...:$ZOOKEEPEERPATH/bin
    
    • 检查三台机器是否都启动zookeeper。
    [root@linux01 bin]#zkServer.sh start
    [root@linux02 bin]#zkServer.sh start
    [root@linux04 bin]#zkServer.sh start
    
    • linux01节点和linux02节点,linux04上启动QJM , journalnode主要是通过独立进程同步多个NameNode数据。这里journalnode要求启动奇数数量,并且至少3台。
    [root@linux01 ~]# hadoop-daemon.sh start journalnode
    # jps查看 是否启动成功
    [root@linux01 ~]# jps
    30604 JournalNode
    
    
    [root@linux02 ~]# hadoop-daemon.sh start journalnode
    [root@linux02 ~]# jps
    9786 JournalNode
    
    [root@linux04 ~]# hadoop-daemon.sh start journalnode
    [root@linux04 ~]# jps
    9781 JournalNode
    
    • 将linux01的namenode格式化
    [root@linux01 ~]# hdfs namenode -format
    
    • 启动linux01的namenode
    [root@linux01 ~]# hadoop-daemon.sh start namenode
    # 检查
    [root@linux01 ~]# jps
    595 JournalNode
    1242 NameNode
    1692 QuorumPeerMain
    1327 Jps
    
    • 第二个namenode 同步第一个
    [root@linux02 ~]# hdfs namenode -bootstrapStandby
    
    • 启动第二个namenode
    hadoop-daemon.sh start namenode
    
    • 查看页面均已经启动:

    • linux01

    • linux02

    • 手动强制切换

    [root@linux01 ~]# hdfs haadmin -transitionToActive --forcemanual nn1
    
    • 查看nn2,nn1状态 可以看到nn1为激活,nn2为准备
    [root@linux01 ~]# hdfs haadmin -getServiceState nn1
    active
    [root@linux01 ~]# hdfs haadmin -getServiceState nn2
    standby
    
    • zookeeper设置故障自转移
    [root@linux01 ~]# hdfs zkfc -formatZK
    
    # 验证,进入zookeeper客户端:
    [root@linux01 ~]# zkCli.sh -server linux01:2181
    [zk: linux01:2181(CONNECTED) 0] ls /
    [..hadoop-ha..]
    # 可以看到hadoop-ha 代表成功
    
    • 集群启动
    [root@linux01 ~]# start-dfs.sh
    
    • 三台机器JPS启动状态
    [root@linux01 ~]# jps
    4256 DFSZKFailoverController
    3745 NameNode
    1692 QuorumPeerMain
    4316 Jps
    3854 DataNode
    4062 JournalNode
    [root@linux02 ~]# jps
    19936 JournalNode
    19812 DataNode
    30230 QuorumPeerMain
    20076 DFSZKFailoverController
    20190 Jps
    19742 NameNode
    [root@linux04 hadoop]# jps
    11922 Jps
    11620 JournalNode
    11404 DataNode
    6605 QuorumPeerMain
    
    • 验证
    # 查看2个namenode状态
    [root@linux01 hadoop]# hdfs haadmin -getServiceState nn2
    active
    [root@linux01 hadoop]# hdfs haadmin -getServiceState nn1
    standby
    # 可以看到nn1 为准备状态,nn2为运行状态,杀掉nn2的 namenode
    [root@linux02 ~]# kill -9 19742
    # 查看nn1状态,为激活状态
    [root@linux01 hadoop]# hdfs haadmin -getServiceState nn1
    active
    # 查看nn2状态, 已经无法连接
    # 重新启动同步nn1,并启动nn2
    [root@linux02 ~]# hdfs namenode -bootstrapStandby
    [root@linux02 ~]# hadoop-daemon.sh start namenode
    # 查看重启nn2状态,准备中
    [root@linux02 ~]# hdfs haadmin -getServiceState nn2
    standby
    # 杀死nn1,看会不会将namenode切换nn2
    [root@linux01 ~]# kill -9 3745
    [root@linux01 ~]# hdfs haadmin -getServiceState nn2
    active
    # 切换到nn2上
    

    2.Yarn的HA搭建

    • 修改yarn-site.xml
    <configuration>
    	<property>
    		<name>yarn.nodemanager.aux-services</name>
    		<value>mapreduce_shuffle</value>
    	</property>
    	<!--日志聚合-->
    	<property>
    		<name>yarn.log-aggregation-enable</name>
    		<value>true</value>
    	</property>
    	<!--任务历史服务-->
    	<property>
    		<name>yarn.log.server.url</name>
    		<value>http://linux01:19888/jobhistory/logs/</value>
    	</property>
    	<property>
    		<name>yarn.log-aggregation.retain-seconds</name>
    		<value>86400</value>
    	</property>
    	<!--启用resourcemanager ha-->
    	<property>
    		<name>yarn.resourcemanager.ha.enabled</name>
    		<value>true</value>
    	</property>
    	<!--声明两台resourcemanager的地址-->
    	<property>
    		<name>yarn.resourcemanager.cluster-id</name>
    		<value>cluster-yarn1</value>
    	</property>
    	<property>
    		<name>yarn.resourcemanager.ha.rm-ids</name>
    		<value>rm1,rm2</value>
    	</property>
    	<property>
    		<name>yarn.resourcemanager.hostname.rm1</name>
    		<value>linux02</value>
    	</property>
    	<property>
    		<name>yarn.resourcemanager.hostname.rm2</name>
    		<value>linux04</value>
    	</property>
    	<!--指定zookeeper集群的地址-->
    	<property>
    		<name>yarn.resourcemanager.zk-address</name>
    		<value>linux01:2181,linux02:2181,linux04:2181</value>
    	</property>
    	<!--启用自动恢复-->
    	<property>
    		<name>yarn.resourcemanager.recovery.enabled</name>
    		<value>true</value>
    	</property>
    	<!--指定resourcemanager的状态信息存储在zookeeper集群-->
    	<property>
    		<name>yarn.resourcemanager.store.class</name>     
    		<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    	</property>
    </configuration>
    
    • 我在linux02和linux04配置ResourceManager 所以在linux02启动
    # linux02启动yarn
    [root@linux02 ~]# start-yarn.sh
    # jps查看,可以看到ResourceManager已经启动
    [root@linux02 ~]# jps
    7298 NameNode
    8355 ResourceManager
    30230 QuorumPeerMain
    7756 NodeManager
    7484 JournalNode
    7372 DataNode
    8717 Jps
    7630 DFSZKFailoverController
    # 启动linux04的yarn
    [root@linux04 ~]# start-yarn.sh
    # jps查看,可以看到ResourceManager已经启动
    [root@linux04 ~]# jps
    28293 JournalNode
    5754 Jps
    27931 DataNode
    5532 ResourceManager
    6605 QuorumPeerMain
    28622 NodeManager
    
    • 查看ResourceManager的状态
    [root@linux01 current]# yarn rmadmin -getServiceState rm1
    active
    [root@linux01 current]# yarn rmadmin -getServiceState rm2
    standby
    # 可以看到rm1为活跃状态,rm2为准备状态
    
    • 验证:
    # 杀死rm1,rm1在linux02上
    [root@linux02 ~]# kill -9 8355
    # 查看rm1和rm2状态
    [root@linux02 ~]# yarn rmadmin -getServiceState rm2
    active
    [root@linux02 ~]# yarn rmadmin -getServiceState rm1
    21/06/18 09:12:29 INFO ipc.Client: Retrying connect to server: linux02/10.0.0.156:8033. Already tried 0 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=1, sleepTime=1000 MILLISECONDS)
    Operation failed: Call From linux02/10.0.0.156 to linux02:8033 failed on connection exception: java.net.ConnectException: 拒绝连接; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused
    # 此时rm2为活跃状态而rm1无法连接
    
    
    # 重启rm1,并杀死rm2看yarn是否能切换到rm1
    [root@linux02 ~]# start-yarn.sh
    # 此时rm2为激活状态,rm1为准备状态
    [root@linux02 ~]# yarn rmadmin -getServiceState rm2
    active
    [root@linux02 ~]# yarn rmadmin -getServiceState rm1
    standby
    # 杀死rm2
    [root@linux04 ~]# kill -9 5532
    # 查看rm1
    [root@linux04 ~]# yarn rmadmin -getServiceState rm1
    active
    # 此时rm1也成功切换
    
    • 参考文献:

    https://blog.csdn.net/baidu_28997655/article/details/81906591

    https://blog.csdn.net/oschina_41140683/article/details/80332080

    https://www.notion.so/Spark-ed90b76b7c83408990a2959b0030c2fb

    https://blog.csdn.net/mashuai720/article/details/80097217

  • 相关阅读:
    简单聚合查询
    简单搜索入门
    简单的document操作
    快速检测集群的健康状况
    Log4j和Slf4j的比较
    javascript中escape()、unescape()、encodeURI()、encodeURIComponent()、decodeURI()、decodeURIComponent()比较
    Spring-data-jpa详解,全方位介绍。
    JSON关联属性转换异常
    原生类型 和 参数化类型
    Spring Data JPA
  • 原文地址:https://www.cnblogs.com/xujunkai/p/14898175.html
Copyright © 2011-2022 走看看