cha04 Hadoop HA

# Time synchronization
yum -y install ntpdate.x86_64
ntpdate -u cn.pool.ntp.org

# Unpack the Hadoop tarball
tar -zxvf /opt/download/hadoop-3.1.3.tar.gz -C /opt/software

# Environment variables, then activate them
vim /etc/profile.d/my.sh
#---------------------------------------------------
# hadoop
export HADOOP_HOME=/opt/software/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HADOOP_HOME/lib
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export HDFS_JOURNALNODE_USER=root
export HDFS_ZKFC_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_YARN_HOME=$HADOOP_HOME
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_LIBEXEC_DIR=$HADOOP_HOME/libexec
export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
#---------------------------------------------------
source /etc/profile
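
A quick sanity check that the environment took effect (a minimal sketch; the expected values follow the install path used above):

echo $HADOOP_HOME    # should print /opt/software/hadoop-3.1.3
hadoop version       # should report Hadoop 3.1.3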

# Create the data directory
cd /opt/software/hadoop-3.1.3
mkdir data

# Hadoop internal configuration
cd /opt/software/hadoop-3.1.3/etc/hadoop

vim hadoop-env.sh
#---------------------------------------------------
export JAVA_HOME=/opt/software/jdk-1.8.0
#---------------------------------------------------

vim workers
#---------------------------------------------------
master01
master02
worker01
worker02
#---------------------------------------------------
<!-- vim core-site.xml -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://kb19cluster</value>
<description>Logical name of the HDFS cluster; must match the dfs.nameservices value in hdfs-site.xml</description>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/tmp/hadoop/kb19cluster</value>
<description>Local Hadoop temporary directory on the NameNode</description>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
<!-- Beware of invisible characters (e.g. U+202C) pasted after the value -->
<property>
<name>io.file.buffer.size</name>
<value>1048576</value>
<description>Read/write buffer size for SequenceFiles: 1 MB</description>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>master01:2181,master02:2181,worker01:2181</value>
</property>
<property>
<name>hadoop.zk.address</name>
<value>master01:2181,master02:2181,worker01:2181</value>
</property>
<property>
<name>ha.zookeeper.session-timeout.ms</name>
<value>10000</value>
<description>Timeout (ms) for Hadoop connections to ZooKeeper</description>
</property>
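
The ZooKeeper ensemble listed in ha.zookeeper.quorum must already be running before the zkfc format step later in this chapter; a quick check on each ZooKeeper host:

zkServer.sh status    # expect Mode: leader on one host and Mode: follower on the others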
<!-- vim hdfs-site.xml -->
<property>
<name>dfs.replication</name>
<value>2</value>
<description>Number of replicas kept for each HDFS block</description>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/opt/software/hadoop-3.1.3/data/dfs/name</value>
<description>Where the NameNode stores the HDFS namespace metadata</description>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/opt/software/hadoop-3.1.3/data/dfs/data</value>
<description>Physical location of block data on the DataNodes</description>
</property>
<property>
<name>dfs.nameservices</name>
<value>kb19cluster</value>
<description>HDFS nameservice ID; must match the value used in core-site.xml</description>
</property>
<property>
<name>dfs.ha.namenodes.kb19cluster</name>
<value>nn1,nn2</value>
<description>kb19cluster is the cluster's logical name; it maps to the two NameNode logical names</description>
</property>
<property>
<name>dfs.namenode.rpc-address.kb19cluster.nn1</name>
<value>master01:8020</value>
<description>RPC address of nn1 (master01)</description>
</property>
<property>
<name>dfs.namenode.http-address.kb19cluster.nn1</name>
<value>master01:9870</value>
<description>HTTP address of nn1 (master01)</description>
</property>
<property>
<name>dfs.namenode.rpc-address.kb19cluster.nn2</name>
<value>master02:8020</value>
<description>RPC address of nn2 (master02)</description>
</property>
<property>
<name>dfs.namenode.http-address.kb19cluster.nn2</name>
<value>master02:9870</value>
<description>HTTP address of nn2 (master02)</description>
</property>
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://master01:8485;master02:8485;worker01:8485/kb19cluster</value>
<description>Shared storage for the NameNode edit log (list of JournalNodes)</description>
</property>
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/tmp/hadoop/journaldata</value>
<description>Local directory where each JournalNode stores its data</description>
</property>

<!-- Fault tolerance: automatic failover and fencing -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
<description>Enable automatic NameNode failover</description>
</property>
<property>
<name>dfs.client.failover.proxy.provider.kb19cluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
<description>Proxy provider class used by clients to locate the active NameNode</description>
</property>
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
<description>Fencing method used to prevent split-brain</description>
</property>
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
<description>sshfence requires passwordless SSH between the NameNodes</description>
</property>

<!-- Disable permission checks so operations do not fail on permission errors (test/lab setting) -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
<description>Disable HDFS permission checking</description>
</property>

<!-- Throttling: leave more memory and bandwidth to jobs -->
<property>
<name>dfs.image.transfer.bandwidthPerSec</name>
<value>1048576</value>
</property>
<property>
<name>dfs.block.scanner.volume.bytes.per.second</name>
<value>1048576</value>
</property>
<property>
<name>dfs.datanode.balance.bandwidthPerSec</name>
<value>20m</value>
</property>
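
Once core-site.xml and hdfs-site.xml are in place, the effective HA settings can be read back as a quick sanity check (run on any node that carries this configuration):

hdfs getconf -confKey dfs.nameservices    # kb19cluster
hdfs getconf -namenodes                   # master01 master02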
<!-- vim mapred-site.xml -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
<description>Job execution framework: local, classic or yarn</description>
<final>true</final>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>{hadoop classpath}</value>
</property>
<!-- The job history server only needs to be configured on one node -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>master01:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>master01:19888</value>
</property>
<!-- Container memory limits, read and enforced by the NodeManager; tasks that exceed them are killed (often surfacing as "Connection reset by peer") -->
<property>
<name>mapreduce.map.memory.mb</name>
<value>256</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>512</value>
</property>
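
Both mapreduce.application.classpath above and yarn.application.classpath in yarn-site.xml use {hadoop classpath} as a placeholder. One common approach is to run the hadoop classpath command and paste its output in as the property value:

hadoop classpath    # prints the full classpath; use the output as the property value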
<!-- vim yarn-site.xml -->
<!-- Resource Manager Configs -->
<!-- Fault tolerance -->
<property>
<name>yarn.resourcemanager.connect.retry-interval.ms</name>
<value>10000</value>
</property>
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
<value>true</value>
</property>

<!-- ResourceManager restart recovery -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
<description>Running jobs are not affected while the RM restarts</description>
</property>

<!-- Where application state is stored: ZooKeeper -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
<description>Store for application state; HA only supports ZKRMStateStore</description>
</property>

<!-- YARN cluster configuration -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>kb19cluster</value>
</property>
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
<name>yarn.resourcemanager.work-preserving-recovery.enabled</name>
<value>true</value>
</property>

<!-- rm1 configs -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>master01</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>master01:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>master01:8030</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.https.address.rm1</name>
<value>master01:8090</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>master01:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>master01:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address.rm1</name>
<value>master01:8033</value>
</property>

<!-- rm2 configs -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>master02</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>master02:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>master02:8030</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.https.address.rm2</name>
<value>master02:8090</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>master02:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>master02:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address.rm2</name>
<value>master02:8033</value>
</property>

<!-- NodeManager configs: must be set on every node, with that node's own hostname -->
<property>
<description>Address where the localizer IPC is.</description>
<name>yarn.nodemanager.localizer.address</name>
<value>master01:8040</value>
</property>
<property>
<description>Address of the NodeManager (container manager IPC).</description>
<name>yarn.nodemanager.address</name>
<value>master01:8050</value>
</property>
<property>
<description>NodeManager webapp address.</description>
<name>yarn.nodemanager.webapp.address</name>
<value>master01:8042</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.local-dirs</name>
<value>/tmp/hadoop/yarn/local</value>
</property>
<property>
<name>yarn.nodemanager.log-dirs</name>
<value>/tmp/hadoop/yarn/log</value>
</property>

<!-- Resource tuning -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>256</value>
</property>

<!-- Log aggregation -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>86400</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>{hadoop classpath}</value>
</property>
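
Before initializing, the unpacked Hadoop directory (including the etc/hadoop files edited above) and the profile script have to be present on all four nodes. A minimal sync sketch, assuming passwordless root SSH to the other hosts:

for host in master02 worker01 worker02; do
  scp -r /opt/software/hadoop-3.1.3 $host:/opt/software/
  scp /etc/profile.d/my.sh $host:/etc/profile.d/
done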
Initialize and start the cluster
# Start the ZooKeeper cluster
zkServer.sh start # on all 4 nodes
zkServer.sh status # 1 leader + 3 followers
# Start the JournalNode cluster
hdfs --daemon start journalnode # on all 4 nodes
# Format ZKFC (creates the HA znode in ZooKeeper)
hdfs zkfc -formatZK
# Format the primary NameNode (on master01)
hdfs namenode -format
# First full start of the cluster
start-all.sh
# Bootstrap and start the standby NameNode (on master02)
hdfs namenode -bootstrapStandby
hdfs --daemon start namenode
# Check running services
bash ~/scall.sh jps yes
-------------------
| master01 RESULT |
-------------------
3696 QuorumPeerMain
7553 JournalNode
9106 Jps
7720 DFSZKFailoverController
8745 NameNode
8282 NodeManager
8140 ResourceManager
7310 DataNode
-------------------
| master02 RESULT |
-------------------
4278 JournalNode
4550 NodeManager
5463 NameNode
4473 ResourceManager
4204 DataNode
6092 Jps
3037 QuorumPeerMain
5549 ZooKeeperMain
4382 DFSZKFailoverController
-------------------
| worker01 RESULT |
-------------------
3016 QuorumPeerMain
3849 JournalNode
3946 NodeManager
4218 Jps
3758 DataNode
-------------------
| worker02 RESULT |
-------------------
3168 QuorumPeerMain
4036 NodeManager
3943 JournalNode
4313 Jps
3852 DataNode
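
scall.sh above is a site-specific helper script, not part of Hadoop; a rough, hypothetical equivalent that runs jps on every host over SSH (assuming passwordless SSH to all four nodes) could look like:

for host in master01 master02 worker01 worker02; do
  echo "| $host RESULT |"
  ssh $host jps
done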

# From the second startup onward, daemons can also be started/stopped individually
hdfs --daemon start|stop namenode|datanode|journalnode|zkfc|secondarynamenode
yarn --daemon start|stop resourcemanager|nodemanager

# Start HDFS and YARN
start-dfs.sh # master01
start-yarn.sh # master01

# Start/stop the job history server
mapred --daemon start historyserver # master01
mapred --daemon stop historyserver # master01
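
After starting it, a JobHistoryServer process should show up in jps on master01, and the web UI listens on the mapreduce.jobhistory.webapp.address configured above (a quick check, assuming curl is available):

jps | grep JobHistoryServer
curl -s -o /dev/null -w "%{http_code}\n" http://master01:19888    # expect 200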

# Stop HDFS and YARN
stop-yarn.sh # master01
stop-dfs.sh # master01

# Check the state of nn1 and nn2
hdfs haadmin -getServiceState nn1 # standby
hdfs haadmin -getServiceState nn2 # active
# Manually transition a NameNode to active
hdfs haadmin -transitionToActive --forcemanual nn1 # refused if another NameNode is already active
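
The ResourceManager pair can be checked the same way, using the rm-ids configured in yarn-site.xml (one should report active, the other standby):

yarn rmadmin -getServiceState rm1
yarn rmadmin -getServiceState rm2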

# Verify high availability
# Stop the NameNode on master01 and check that the NameNode on master02 becomes active (standby -> active)
hdfs --daemon stop namenode

# Restart the NameNode on master01; it comes back as standby
hdfs --daemon start namenode

# [Second heartbeat path: nn1 and nn2 register their info in, and have their state monitored by, the ZooKeeper cluster]

Source: https://leaf-domain.gitee.io/2024/07/17/hadoop_ha_install/ (author: 叶域, published 2024-07-17)