Hadoop Deployment
- Prepare three machines, with roles planned as follows:
linux01 10.0.0.155 NameNode DataNode NodeManager
linux02 10.0.0.156 SecondaryNameNode DataNode NodeManager ResourceManager
linux04 10.0.0.161 DataNode NodeManager
1. Java installation
# I used jdk1.8.0_281.tar.gz
# Extract it directly under /usr/local/java/
[linux01 /usr/local/java ]# tar zxvf jdk1.8.0_281.tar.gz
- Add the environment variables
vi /etc/profile
export JAVA_HOME=/usr/local/java/jdk1.8.0_281/
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
PATH=${JAVA_HOME}/bin:$PATH
export PATH
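A quick sanity check (my addition, assuming the paths above): reload the profile and confirm the JDK is picked up.
[linux01 ~]# source /etc/profile
[linux01 ~]# java -version   # should report 1.8.0_281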
2. hosts configuration
- First rename the three hosts
hostnamectl set-hostname <name>
bash # refresh (start a new shell)
- Configure the hosts mapping on all three hosts
vi /etc/hosts
10.0.0.156 linux02
10.0.0.155 linux01
10.0.0.161 linux04
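Optional check (my addition): verify that the names resolve from each host.
[linux01 ~]# for h in linux01 linux02 linux04; do ping -c 1 $h; done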
3. Passwordless SSH login
# For example, on linux01, run the command below and just press Enter through every prompt
[linux01 ~]# ssh-keygen -t rsa
# Then use ssh-copy-id to install the local SSH public key on the remote hosts, distributing it to linux02 and linux04
[linux01 ~]# ssh-copy-id linux02
[linux01 ~]# ssh-copy-id linux04
# Fix the permissions:
[linux01 ~]# chmod 700 .ssh/
[linux01 ~]# chmod 600 .ssh/authorized_keys
# You also need to append the local public key to authorized_keys, otherwise start-dfs.sh will still prompt for a password later. Background: https://blog.csdn.net/xinneya/article/details/102767327
[linux01 ~]# cat .ssh/id_rsa.pub >> .ssh/authorized_keys
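Optional check (my addition): each of these should print the remote hostname without asking for a password.
[linux01 ~]# ssh linux02 hostname
[linux01 ~]# ssh linux04 hostname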
4. Download Hadoop
- Download from http://archive.apache.org/dist/hadoop/core/hadoop-2.7.1/ and pick the largest file, the one ending in .tar.gz: hadoop-2.7.1.tar.gz
- Copy it to the server and extract it
[linux01 ~]# tar zxvf hadoop-2.7.1.tar.gz
- Rename the directory
[linux01 ~]# mv hadoop-2.7.1 hadoop
5. Modify the configuration files
- Hadoop is installed under /root/hadoop; go into etc/hadoop beneath it, i.e. /root/hadoop/etc/hadoop
vim hdfs-site.xml
Create the name directory in advance (command shown after this config block)
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.namenode.rpc-address</name>
<value>linux01:9000</value>
<description></description>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/root/hadoop/name</value>
<description></description>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>linux02:50090</value>
<description></description>
</property>
</configuration>
# dfs.namenode.name.dir: directory where the NameNode stores its metadata
# dfs.replication: number of replicas per block
# dfs.namenode.secondary.http-address: HTTP address of the SecondaryNameNode
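The note above says to create the name directory in advance; on linux01 that is just:
[linux01 ~]# mkdir -p /root/hadoop/name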
hadoop-env.sh
Change it to: export JAVA_HOME=/usr/local/java/jdk1.8.0_281/
core-site.xml
Create the tmp directory in advance (command shown after this config block)
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://linux01:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/root/hadoop/tmp</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file://${hadoop.tmp.dir}/dfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file://${hadoop.tmp.dir}/dfs/data</value>
</property>
</configuration>
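Likewise, the tmp directory from the note above can be created with:
[linux01 ~]# mkdir -p /root/hadoop/tmp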
slaves
Add the DataNode hosts to the slaves file
linux02
linux04
linux01
yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>linux02</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>106800</value>
</property>
</configuration>
mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>linux01:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>linux01:19888</value>
</property>
</configuration>
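Note (my addition): the Hadoop 2.7.1 tarball usually ships only mapred-site.xml.template; if mapred-site.xml does not exist yet, copy the template first:
[linux01 /root/hadoop/etc/hadoop]# cp mapred-site.xml.template mapred-site.xml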
6. Distribute the configuration files
- Distribute the configured Hadoop directory to the other nodes
scp -r hadoop/ linux02:$PWD
scp -r hadoop/ linux04:$PWD
- Configure the Hadoop and YARN environment variables
export HADOOP_HOME=/root/hadoop/
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
PATH=${JAVA_HOME}/bin:$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export PATH
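Reload the profile and verify the hadoop command is found (my addition; assumes the lines above were appended to /etc/profile):
source /etc/profile
hadoop version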
- Format the NameNode on the linux01 node (the master node)
hdfs namenode -format
7. Start the services
- Start HDFS
start-dfs.sh
- Start YARN
start-yarn.sh
- Start the YARN web management interface; here the ResourceManager is started on linux02
yarn-daemon.sh start resourcemanager
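Optionally, a quick way to confirm everything registered (my addition; assumes the daemons came up cleanly):
[linux01 ~]# hdfs dfsadmin -report   # should show 3 live DataNodes
[linux02 ~]# yarn node -list         # should list 3 NodeManagers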
8. Quick test
- Upload a file
hdfs dfs -put t1.txt /
- dfs web: http://10.0.0.155:50070/
- yarn web: http://10.0.0.156:8088/
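To exercise HDFS and YARN end to end, you can also run the bundled wordcount example on the uploaded file (my addition; /wc-out is just an example output path and must not already exist):
[linux01 ~]# hadoop jar /root/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar wordcount /t1.txt /wc-out
[linux01 ~]# hdfs dfs -cat /wc-out/part-r-00000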
9. Hadoop fully distributed (HA) configuration
1. NameNode HA setup
Modify hdfs-site.xml
<configuration>
<!-- Logical name of the HDFS nameservice -->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<!-- NameNode IDs within the cluster -->
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<!-- RPC address of nn1 -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>linux01:8020</value>
</property>
<!-- RPC address of nn2 -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>linux02:8020</value>
</property>
<!-- HTTP address of nn1 -->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>linux01:50070</value>
</property>
<!-- HTTP address of nn2 -->
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>linux02:50070</value>
</property>
<!-- Where the NameNode shared edit log is stored on the JournalNodes -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://linux01:8485;linux02:8485;linux04:8485/mycluster</value>
</property>
<!-- Proxy provider the client uses to locate the active NameNode and fail over automatically -->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fencing method, so that only one NameNode serves requests at any time -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- sshfence requires passwordless SSH login -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<!-- Storage directory on the JournalNode servers -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/root/hadoop/jn</value>
</property>
<!-- Disable permission checking -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- Enable automatic failover -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
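The JournalNode edits directory declared above (/root/hadoop/jn) lives on every JournalNode host; creating it up front on each node avoids permission surprises (my addition, optional):
[root@linux01 ~]# mkdir -p /root/hadoop/jn   # repeat on linux02 and linux04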
core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/root/hadoop/tmp</value>
</property>
<!-- Trash retention time, in minutes -->
<property>
<name>fs.trash.interval</name>
<value>2</value>
</property>
<!-- Trash checkpoint interval; must be no larger than fs.trash.interval -->
<property>
<name>fs.trash.checkpoint.interval</name>
<value>1</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>linux01:2181,linux02:2181,linux04:2181</value>
</property>
</configuration>
- Send the finished configuration to linux02 and linux04
scp -r hadoop/ linux04:$PWD
scp -r hadoop/ linux02:$PWD
- Configure the ZooKeeper environment variable on all three machines
export ZOOKEEPEERPATH=/opt/zookeeper-3.4.6
...:$ZOOKEEPEERPATH/bin
- Check that ZooKeeper is started on all three machines.
[root@linux01 bin]# zkServer.sh start
[root@linux02 bin]# zkServer.sh start
[root@linux04 bin]# zkServer.sh start
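Optional check (my addition): confirm the ensemble actually formed, i.e. one leader and two followers.
[root@linux01 bin]# zkServer.sh status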
- Start the QJM on linux01, linux02, and linux04. The JournalNodes are independent processes that keep data in sync between the NameNodes; an odd number of them is required, at least 3.
[root@linux01 ~]# hadoop-daemon.sh start journalnode
# check with jps that it started successfully
[root@linux01 ~]# jps
30604 JournalNode
[root@linux02 ~]# hadoop-daemon.sh start journalnode
[root@linux02 ~]# jps
9786 JournalNode
[root@linux04 ~]# hadoop-daemon.sh start journalnode
[root@linux04 ~]# jps
9781 JournalNode
- Format the NameNode on linux01
[root@linux01 ~]# hdfs namenode -format
- Start the NameNode on linux01
[root@linux01 ~]# hadoop-daemon.sh start namenode
# check
[root@linux01 ~]# jps
595 JournalNode
1242 NameNode
1692 QuorumPeerMain
1327 Jps
- Sync the second NameNode from the first
[root@linux02 ~]# hdfs namenode -bootstrapStandby
- Start the second NameNode
[root@linux02 ~]# hadoop-daemon.sh start namenode
- Check the web pages; the NameNodes on linux01 and linux02 are both up.
- Manually force a transition to active
[root@linux01 ~]# hdfs haadmin -transitionToActive --forcemanual nn1
- Check the states of nn1 and nn2: nn1 is active and nn2 is standby
[root@linux01 ~]# hdfs haadmin -getServiceState nn1
active
[root@linux01 ~]# hdfs haadmin -getServiceState nn2
standby
- Set up automatic failover via ZooKeeper
[root@linux01 ~]# hdfs zkfc -formatZK
# Verify by entering the ZooKeeper client:
[root@linux01 ~]# zkCli.sh -server linux01:2181
[zk: linux01:2181(CONNECTED) 0] ls /
[..hadoop-ha..]
# Seeing hadoop-ha in the output means it succeeded
- Start the cluster
[root@linux01 ~]# start-dfs.sh
- jps status on the three machines
[root@linux01 ~]# jps
4256 DFSZKFailoverController
3745 NameNode
1692 QuorumPeerMain
4316 Jps
3854 DataNode
4062 JournalNode
[root@linux02 ~]# jps
19936 JournalNode
19812 DataNode
30230 QuorumPeerMain
20076 DFSZKFailoverController
20190 Jps
19742 NameNode
[root@linux04 hadoop]# jps
11922 Jps
11620 JournalNode
11404 DataNode
6605 QuorumPeerMain
- Verification
# Check the state of both NameNodes
[root@linux01 hadoop]# hdfs haadmin -getServiceState nn2
active
[root@linux01 hadoop]# hdfs haadmin -getServiceState nn1
standby
# nn1 is standby and nn2 is active; kill nn2's NameNode
[root@linux02 ~]# kill -9 19742
# Check nn1: it is now active
[root@linux01 hadoop]# hdfs haadmin -getServiceState nn1
active
# Checking nn2 now fails: it can no longer be reached
# Re-sync from nn1 and start nn2 again
[root@linux02 ~]# hdfs namenode -bootstrapStandby
[root@linux02 ~]# hadoop-daemon.sh start namenode
# Check the restarted nn2: it is standby
[root@linux02 ~]# hdfs haadmin -getServiceState nn2
standby
# Kill nn1 and see whether the active NameNode switches to nn2
[root@linux01 ~]# kill -9 3745
[root@linux01 ~]# hdfs haadmin -getServiceState nn2
active
# It switched over to nn2
2. YARN HA setup
- Modify yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Log aggregation -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- Job history log server -->
<property>
<name>yarn.log.server.url</name>
<value>http://linux01:19888/jobhistory/logs/</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>86400</value>
</property>
<!-- Enable ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- Declare the two ResourceManagers -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>cluster-yarn1</value>
</property>
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>linux02</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>linux04</value>
</property>
<!-- ZooKeeper ensemble address -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>linux01:2181,linux02:2181,linux04:2181</value>
</property>
<!-- Enable automatic recovery -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- Store ResourceManager state in the ZooKeeper ensemble -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
</configuration>
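As with the earlier configs, remember to push the updated yarn-site.xml to the other nodes before starting YARN (my addition; paths assume the /root/hadoop layout used above):
[root@linux01 ~]# scp hadoop/etc/hadoop/yarn-site.xml linux02:/root/hadoop/etc/hadoop/
[root@linux01 ~]# scp hadoop/etc/hadoop/yarn-site.xml linux04:/root/hadoop/etc/hadoop/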
- I configured the ResourceManagers on linux02 and linux04, so start YARN on linux02
# start YARN on linux02
[root@linux02 ~]# start-yarn.sh
# jps shows that the ResourceManager is running
[root@linux02 ~]# jps
7298 NameNode
8355 ResourceManager
30230 QuorumPeerMain
7756 NodeManager
7484 JournalNode
7372 DataNode
8717 Jps
7630 DFSZKFailoverController
# start YARN on linux04
[root@linux04 ~]# start-yarn.sh
# jps shows that the ResourceManager is running
[root@linux04 ~]# jps
28293 JournalNode
5754 Jps
27931 DataNode
5532 ResourceManager
6605 QuorumPeerMain
28622 NodeManager
- Check the ResourceManager states
[root@linux01 current]# yarn rmadmin -getServiceState rm1
active
[root@linux01 current]# yarn rmadmin -getServiceState rm2
standby
# rm1 is active and rm2 is standby
- Verification:
# kill rm1 (it runs on linux02)
[root@linux02 ~]# kill -9 8355
# check the states of rm1 and rm2
[root@linux02 ~]# yarn rmadmin -getServiceState rm2
active
[root@linux02 ~]# yarn rmadmin -getServiceState rm1
21/06/18 09:12:29 INFO ipc.Client: Retrying connect to server: linux02/10.0.0.156:8033. Already tried 0 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=1, sleepTime=1000 MILLISECONDS)
Operation failed: Call From linux02/10.0.0.156 to linux02:8033 failed on connection exception: java.net.ConnectException: 拒绝连接; For more details see: http://wiki.apache.org/hadoop/ConnectionRefused
# rm2 is now active while rm1 cannot be reached
# restart rm1, then kill rm2 to see whether YARN switches back to rm1
[root@linux02 ~]# start-yarn.sh
# now rm2 is active and rm1 is standby
[root@linux02 ~]# yarn rmadmin -getServiceState rm2
active
[root@linux02 ~]# yarn rmadmin -getServiceState rm1
standby
# kill rm2
[root@linux04 ~]# kill -9 5532
# check rm1
[root@linux04 ~]# yarn rmadmin -getServiceState rm1
active
# rm1 has successfully taken over