Alibaba Cloud Setup

Configure passwordless SSH across the cluster

# Configure the cluster IP-to-hostname mappings
vim /etc/hosts
# For the Hadoop cluster, prefer private IPs; with public IPs the NameNode and ResourceManager fail to start
# Error: ERROR org.apache.hadoop.ha.ZKFailoverController: The failover controller encounters runtime error: java.net.BindException: Problem binding to [master01:8019] java.net.BindException: Cannot assign requested address;
# Fix: change the local host's mapping to its private IP and leave the other entries unchanged
-------------------------------
101.37.71.111 zyh
118.31.67.112 why
101.37.161.4 zzy
101.37.77.203 zjy
------------------------------
172.27.47.164 zyh
------------------------------

# Passwordless SSH across the cluster; run on every node
ssh-keygen -t rsa

ssh-copy-id root@zyh
ssh-copy-id root@why
ssh-copy-id root@zzy
ssh-copy-id root@zjy

# Passwords
101.37.71.111 Zyh123456789
118.31.67.112 Why12345
101.37.161.4 Zzy147258369
101.37.77.203 Zhou123456

# Create directories for downloads and installed software
mkdir -p /opt/download /opt/software

# Upload all installation packages to one node; after configuring, distribute them to the other nodes
scp xxxx root@zyh:/opt/download

# Upload the environment variable file and activate it
scp /etc/profile.d/myenv.sh root@zyh:/etc/profile.d/
source /etc/profile
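With the keys in place, files can be pushed to every other node in one pass. A minimal sketch, assuming the hostnames from /etc/hosts above and the myenv.sh file from the previous step:

# Hedged sketch: copy the environment file to the remaining nodes
for host in why zzy zjy; do
  scp /etc/profile.d/myenv.sh root@${host}:/etc/profile.d/
  # each node still needs to run 'source /etc/profile' in its own shell
done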

zookeeper

# Create a data directory under the ZooKeeper root directory
mkdir data

# Set this ZooKeeper server's id
vim data/myid
----------
1
----------

# Edit the ZooKeeper configuration file
mv conf/zoo_sample.cfg conf/zoo.cfg
vim conf/zoo.cfg
----------------------------------
dataDir=/opt/software/zookeeper-3.6.3/data
quorumListenOnAllIPs=true # see below
clientPort=2181
server.1=zzy:2888:3888
server.2=zyh:2888:3888
server.3=why:2888:3888
----------------------------------

# On a public network the following error occurs:
#
# Fix: append this setting
echo 'quorumListenOnAllIPs=true' >> /opt/software/zookeeper-3.6.3/conf/zoo.cfg

# quorumListenOnAllIPs is a ZooKeeper configuration parameter that controls which IP address ZooKeeper binds to for intra-cluster communication. By default (when the parameter is unset), ZooKeeper binds to the IP address given in the server.X entries. In some cases, however, for example when a ZooKeeper server runs on a machine with multiple network interfaces (such as a VM or a cloud instance), binding to a single specified IP can cause problems, because that IP may not be reachable by all cluster members.
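A minimal sketch for stamping each node's server id, assuming the id-to-host mapping follows the server.N lines in zoo.cfg above and the same install path on every node:

# Hedged sketch: write each host's myid to match server.1=zzy, server.2=zyh, server.3=why
ssh root@zzy "echo 1 > /opt/software/zookeeper-3.6.3/data/myid"
ssh root@zyh "echo 2 > /opt/software/zookeeper-3.6.3/data/myid"
ssh root@why "echo 3 > /opt/software/zookeeper-3.6.3/data/myid"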

Hadoop high availability (HA with multiple NameNodes)

# Time synchronization
yum -y install ntpdate.x86_64
ntpdate -u cn.pool.ntp.org
# Extract and rename
# Environment variables, then activate
#---------------------------------------------------
# hadoop
export HADOOP_HOME=/opt/software/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HADOOP_HOME/lib
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export HDFS_JOURNALNODE_USER=root
export HDFS_ZKFC_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_YARN_HOME=$HADOOP_HOME
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_LIBEXEC_DIR=$HADOOP_HOME/libexec
export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
#---------------------------------------------------
source /etc/profile

# Create the data directory
cd /opt/software/hadoop-3.1.3
mkdir data

# Hadoop internal configuration
cd /opt/software/hadoop-3.1.3/etc/hadoop

vim hadoop-env.sh
#---------------------------------------------------
export JAVA_HOME=/opt/software/jdk1.8.0_171
#---------------------------------------------------

vim workers
#---------------------------------------------------
zyh
why
zzy
#---------------------------------------------------
<!-- vim core-site.xml -->

<!-- Assemble the NameNode addresses into a single logical cluster, mycluster -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<!-- Directory where Hadoop stores its runtime data -->
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/software/hadoop-3.1.3/data</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>why:2181,zyh:2181,zzy:2181</value>
</property>
<property>
<name>hadoop.zk.address</name>
<value>why:2181,zyh:2181,zzy:2181</value>
</property>
<!-- vim hdfs-site.xml -->
<!-- NameNode data directory -->
<property>
<name>dfs.namenode.name.dir</name>
<value>/opt/software/hadoop-3.1.3/data/name</value>
</property>
<!-- DataNode data directory -->
<property>
<name>dfs.datanode.data.dir</name>
<value>/opt/software/hadoop-3.1.3/data/data</value>
</property>
<!-- JournalNode edits directory -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/opt/software/hadoop-3.1.3/data/jn</value>
</property>
<!-- Nameservice id for the fully distributed cluster -->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<!-- NameNodes in the cluster -->
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2,nn3</value>
</property>
<!-- NameNode RPC addresses -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>zyh:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>why:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn3</name>
<value>zzy:8020</value>
</property>
<!-- NameNode HTTP addresses -->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>zyh:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>why:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn3</name>
<value>zzy:9870</value>
</property>
<!-- Where NameNode metadata is stored on the JournalNodes -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://zyh:8485;why:8485;zzy:8485/mycluster</value>
</property>
<!-- Failover proxy provider: used by clients to determine which NameNode is Active -->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fencing method, ensuring only one NameNode serves requests at a time -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- Fencing via sshfence requires key-based SSH login -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>

<!-- Enable automatic NameNode failover -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- Disable permission checks to avoid operation failures caused by permission issues -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- vim mapred-site.xml -->
<!-- The job history server only needs to be configured on one node -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>zyh:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>zyh:19888</value>
</property>

YARN HA

<!-- vim yarn-site.xml -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Enable ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>

<!-- YARN cluster id -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>mycluster</value>
</property>
<!-- Logical list of ResourceManagers -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2,rm3</value>
</property>

<!-- ========== rm1 configs ========== -->
<!-- Hostname of rm1 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>zyh</value>
</property>
<!-- Web UI address of rm1 -->
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>zyh:8088</value>
</property>
<!-- Internal communication address of rm1 -->
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>zyh:8032</value>
</property>
<!-- Address ApplicationMasters use to request resources from rm1 -->
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>zyh:8030</value>
</property>
<!-- Address NodeManagers use to connect -->
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>zyh:8031</value>
</property>

<!-- ========== rm2 configs ========== -->
<!-- Hostname of rm2 -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>why</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>why:8088</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>why:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>why:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>why:8031</value>
</property>
<!-- ========== rm3 configs ========== -->
<!-- Hostname of rm3 -->
<property>
<name>yarn.resourcemanager.hostname.rm3</name>
<value>zzy</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm3</name>
<value>zzy:8088</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm3</name>
<value>zzy:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm3</name>
<value>zzy:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm3</name>
<value>zzy:8031</value>
</property>

<!-- ZooKeeper cluster address -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>why:2181,zyh:2181,zzy:2181</value>
</property>

<!-- Enable automatic recovery -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- Store ResourceManager state in the ZooKeeper cluster -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>

<!-- Environment variable inheritance -->
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
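Once the ResourceManagers are running, the active/standby split can be verified from any node; a quick check using the rm ids defined above:

yarn rmadmin -getServiceState rm1 # one of rm1/rm2/rm3 should report "active", the others "standby"
yarn rmadmin -getServiceState rm2
yarn rmadmin -getServiceState rm3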

# Add the configuration above

# Start the ZooKeeper cluster
zkServer.sh start # 3 nodes; use an odd number: 3, 5, 11
zkServer.sh status # 1 leader + 2 followers
# Start the JournalNode cluster
hdfs --daemon start journalnode # *4, on each node
# Format the ZKFC state in ZooKeeper
hdfs zkfc -formatZK # run once, on a NameNode host
# Format the primary NameNode
hdfs namenode -format # one node only
# First full cluster start
start-all.sh
# Bootstrap and start the standby NameNodes
hdfs namenode -bootstrapStandby
hdfs --daemon start namenode # start just the NameNode daemon
# Check the services

# Start/stop the job history server
mapred --daemon start historyserver # zyh
mapred --daemon stop historyserver # zyh

# Check the state of nn1 and nn2
hdfs haadmin -getServiceState nn1 # standby
hdfs haadmin -getServiceState nn2 # active
# Manually transition a NameNode to active (only when ZooKeeper failover is not in use)
hdfs haadmin -transitionToActive --forcemanual nn1 # refused if another NameNode is already active

# Verify high availability
# Stop the NameNode on the primary node (zyh) and check whether why's NameNode becomes active (standby -> active)
hdfs --daemon stop namenode

# Restart the NameNode on the primary node (zyh); its state should be standby
hdfs --daemon start namenode

# [Second heartbeat group: nn1|nn2 => the ZooKeeper cluster stores NameNode info and monitors NameNode state]
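To confirm that every daemon came up on the expected host, the Java processes on each node can be listed in one pass; a minimal sketch using the hostnames from /etc/hosts above:

# Hedged sketch: show running Java processes on every node
for host in zyh why zzy zjy; do
  echo "===== ${host} ====="
  ssh root@${host} jps
done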

Kafka

# Configuration
config/server.properties
#----------------------------------
broker.id=0 # unique broker id per node: 0, 1, 2
delete.topic.enable=true # allow topic deletion
num.partitions=3 # best set to the number of brokers
# Delete historical data once it is older than one week or a segment exceeds 1 GB
log.retention.hours=168 # keep data for one week
log.segment.bytes=1073741824 # maximum segment size of 1 GB
log.retention.check.interval.ms=300000 # how often to check whether data should be deleted according to the rules above
zookeeper.connect=why:2181,zzy:2181,zyh:2181 # ZooKeeper cluster
#----------------------------------

# Distribute (remote copy) to the other nodes

# Start ZooKeeper and Kafka on all nodes
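A minimal group-start sketch; the broker host list and the Kafka install path (/opt/software/kafka) are assumptions, so adjust them to the actual layout:

# Hedged sketch: start ZooKeeper, then a Kafka broker, on each node
for host in why zyh zzy; do
  ssh root@${host} "source /etc/profile; zkServer.sh start"
  ssh root@${host} "source /etc/profile; kafka-server-start.sh -daemon /opt/software/kafka/config/server.properties"
done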

The workers and masters files are omitted

flink-conf.yaml

jobmanager.rpc.address: zyh
jobmanager.rpc.port: 6123
jobmanager.memory.process.size: 2048m # reduce this if memory is tight (same for the sizes below)
taskmanager.host: zyh
taskmanager.memory.process.size: 4096m
taskmanager.numberOfTaskSlots: 3
parallelism.default: 3

high-availability.type: zookeeper
high-availability.storageDir: hdfs://mycluster/flink/ha/
high-availability.zookeeper.path.root: /flink
high-availability.cluster-id: /cluster_one
high-availability.zookeeper.quorum: zyh:2181,zzy:2181,why:2181

execution.checkpointing.interval: 30000
execution.checkpointing.externalized-checkpoint-retention: RETAIN_ON_CANCELLATION
execution.checkpointing.max-concurrent-checkpoints: 2
execution.checkpointing.min-pause: 500
execution.checkpointing.mode: EXACTLY_ONCE
execution.checkpointing.timeout: 600000
execution.checkpointing.tolerable-failed-checkpoints: 3

restart-strategy.type: fixed-delay
restart-strategy.fixed-delay.attempts: 3
restart-strategy.fixed-delay.delay: 10000

state.backend: filesystem
state.checkpoints.dir: hdfs://mycluster/flink-checkpoints
state.savepoints.dir: hdfs://mycluster/flink-savepoints
jobmanager.execution.failover-strategy: region

rest.port: 8081
rest.address: zyh

jobmanager.archive.fs.dir: hdfs://mycluster/logs/flink-job
historyserver.web.address: zyh
historyserver.web.port: 8082
historyserver.archive.fs.dir: hdfs://mycluster/logs/flink-job
historyserver.archive.fs.refresh-interval: 5000
cp $ZOOKEEPER_HOME/conf/zoo.cfg $FLINK_HOME/conf/
cp $HADOOP_HOME/etc/hadoop/hdfs-site.xml /opt/software/flink-1.17.0/conf/
cp $HADOOP_HOME/etc/hadoop/core-site.xml /opt/software/flink-1.17.0/conf/
# Error: Cannot create Hadoop Security Module because Hadoop cannot be found in the Classpath

# Fix: set the environment variable export HADOOP_CLASSPATH=`hadoop classpath`
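To make the fix survive new shells, the export can be appended to the environment file used earlier (file name assumed from the earlier steps):

echo 'export HADOOP_CLASSPATH=`hadoop classpath`' >> /etc/profile.d/myenv.sh
source /etc/profile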

MySQL 8 Installation

sudo yum update -y
sudo yum install wget -y
wget https://dev.mysql.com/get/mysql80-community-release-el7-7.noarch.rpm
sudo yum localinstall mysql80-community-release-el7-7.noarch.rpm

# Install the MySQL packages with --nogpgcheck, which skips the GPG key check; not recommended for production
sudo yum install mysql-community-server --nogpgcheck

sudo systemctl start mysqld
sudo systemctl enable mysqld
# Get the temporary password
sudo grep 'temporary password' /var/log/mysqld.log
# Connect
mysql -u root -p
# Set an arbitrary temporary password first
ALTER USER 'root'@'localhost' IDENTIFIED BY '1e;YtoJP2kOd';
# Lower the password policy to the minimum and set the minimum length to 6 (Djq110..)
SET GLOBAL validate_password.policy = LOW;
SET GLOBAL validate_password.length = 6;
# Now the password can be set to 123456
ALTER USER 'root'@'localhost' IDENTIFIED BY '123456';

create user root@'%' identified by 'password' password expire never;


ALTER USER 'root'@'%' IDENTIFIED BY 'Ljj315..';
# Flush privileges
FLUSH PRIVILEGES;

# For remote connections, create a user named root; '%' means it can connect from any host
create user root@'%' identified by '123456' password expire never;

# Grant the user all privileges on all databases, then flush
grant all on *.* to root@'%';
# Flush privileges
FLUSH PRIVILEGES;
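A quick check from another node that remote access works; host, port and password are taken from the examples above (use the password actually set for root@'%'):

mysql -h zyh -P 3306 -uroot -p123456 -e "SELECT VERSION();"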

Hive

One node is enough

# 0. Preparation (make sure Hadoop 3.1.3 and MySQL 8 are installed and their services are running before installing)
Upload the Hive tarball apache-hive-3.1.2-bin.tar.gz to /opt/download/second
Upload the MySQL 8 JDBC driver mysql-connector-j-8.0.33.jar to /opt/download/second

# 1. Extract and rename
cd /opt/download
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C /opt/software/
mv /opt/software/apache-hive-3.1.2-bin/ /opt/software/hive-3.1.2
cd /opt/software/hive-3.1.2

# 2. Environment variables, then activate
vim /etc/profile.d/my.sh
#-----------------------------------------
# hive
export HIVE_HOME=/opt/software/hive-3.1.2
export PATH=$PATH:$HIVE_HOME/bin
#-----------------------------------------
source /etc/profile

# 3. Configuration files
mv conf/hive-default.xml.template conf/hive-default.xml
# Create and edit hive-site.xml
vim conf/hive-site.xml

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<!-- Disable security authentication for remote communication between clients and the HiveServer2 service -->
<property>
<name>hive.metastore.sasl.enabled</name>
<value>false</value>
</property>
<!-- Disable HiveServer2 proxy-user (doAs) authentication; use the default user -->
<property>
<name>hive.server2.enable.doAs</name>
<value>false</value>
</property>
<!-- HiveServer2 authentication mechanism: NONE, NOSASL, LDAP, KERBEROS -->
<property>
<name>hive.server2.authentication</name>
<value>NONE</value>
</property>
<!-- HDFS directory for metastore (warehouse) data -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/hive312/warehouse</value>
</property>
<!-- Metastore database type -->
<property>
<name>hive.metastore.db.type</name>
<value>mysql</value>
</property>
<!-- MySQL connection string -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://zyh:3306/hive312?createDatabaseIfNotExist=true</value>
</property>
<!-- MySQL JDBC driver -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.cj.jdbc.Driver</value>
</property>
<!-- MySQL user -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<!-- MySQL password -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<!-- Disable metastore schema verification at Hive startup -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<!-- Show the current database name in the prompt -->
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
</property>
<!-- Print column headers in query output -->
<property>
<name>hive.cli.print.header</name>
<value>true</value>
</property>
</configuration>
# 4. Copy the MySQL driver
cp /opt/download/mysql-connector-j-8.0.33.jar lib/

# 5. Replace the guava jar so it matches Hadoop's
ls lib/|grep guava
# guava-19.0.jar
rm -f lib/guava-19.0.jar
find /opt/software/hadoop-3.1.3/ -name guava*
#/opt/software/hadoop-3.1.3/share/hadoop/common/lib/guava-27.0-jre.jar
#/opt/software/hadoop-3.1.3/share/hadoop/hdfs/lib/guava-27.0-jre.jar
cp /opt/software/hadoop-3.1.3/share/hadoop/hdfs/lib/guava-27.0-jre.jar lib/

# 7. Initialize the schema (make sure the MySQL 8 remote user has been created and granted privileges)
schematool -dbType mysql -initSchema

# 8. Ways to start Hive
# First start the metastore service
nohup hive --service metastore 1>/dev/null 2>&1 &

# Option 1: the hive CLI client
hive
# Option 2: beeline, via the metastore and HiveServer2
# Start the HiveServer2 service
nohup hive --service hiveserver2 1>/dev/null 2>&1 &
beeline -u jdbc:hive2://localhost:10000

##################
# HIVE UDF 3.1.2 #
##################
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
</dependency>
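Once a UDF jar is built against the dependencies above, it can be registered from beeline; a hedged sketch where the jar path, function name and class are placeholders, not values from this setup:

# Hedged sketch: register and test a custom UDF (jar path and class name are hypothetical)
beeline -u jdbc:hive2://localhost:10000 -e "
ADD JAR /opt/download/my-hive-udf.jar;
CREATE TEMPORARY FUNCTION my_udf AS 'com.example.hive.MyUdf';
SELECT my_udf('test');
"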

Spark

# Install the ZooKeeper cluster first
vim spark-env.sh
----------------------------------------------------------------
export JAVA_HOME=/opt/software/jdk1.8.0_171
export HADOOP_CONF_DIR=/opt/software/hadoop-3.1.3/etc/hadoop

# Master host settings
# SPARK_MASTER_HOST=master01 (comment out)
# SPARK_MASTER_PORT=7077 (comment out)

# The default 8080 is easily taken by other services
SPARK_MASTER_WEBUI_PORT=9090
# History server settings
SPARK_HISTORY_OPTS="
-Dspark.history.ui.port=9091
-Dspark.history.fs.logDirectory=hdfs://zyh:8020/spark_event_log_dir
-Dspark.history.retainedApplications=30"
# High-availability settings
SPARK_DAEMON_JAVA_OPTS="
-Dspark.deploy.recoveryMode=ZOOKEEPER
-Dspark.deploy.zookeeper.url=why,zyh,zzy
-Dspark.deploy.zookeeper.dir=/spark"
----------------------------------------------------------------------

workers
-----------
zzy
zyh
-----------
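With ZooKeeper-based recovery configured above, the cluster is started from the Master node and a standby Master is started elsewhere; a minimal sketch (which node hosts the standby Master is an assumption):

# On zyh: start the Master plus the Workers listed in the workers file
$SPARK_HOME/sbin/start-all.sh
# On another node, e.g. why: start a standby Master that takes over via ZooKeeper
$SPARK_HOME/sbin/start-master.sh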
why/2/4 zyh/2/4 zzy/2/4 zjy/2/2
QuorumPeerMain QuorumPeerMain QuorumPeerMain
DataNode DataNode DataNode DataNode
NodeManager NodeManager NodeManager NodeManager
ResourceManager ResourceManager(x) ResourceManager
NameNode NameNode(x) NameNode
DFSZKFailoverController DFSZKFailoverController(x) DFSZKFailoverController
JournalNode JournalNode(x) JournalNode
Master Worker Worker
Kafka Kafka
StandaloneSession ClusterEntrypoint(x) StandaloneSession ClusterEntrypoint(x)
TaskManagerRunner(x) TaskManagerRunner(x) TaskManagerRunner(x) TaskManagerRunner(x)
RunJar RunJar(hive)
mysql 8
http://zzy:9870/		hadoop

http://zyh:9090/ spark Master
http://zyh:8081/ Worker

zyh mysql 3306 123456 hive

Hadoop cluster configuration

If resources are limited, skip HA: configure everything on one node; the other three nodes only need a DataNode and a NodeManager.

Passwordless SSH and environment variables are the same as above.

<!-- core-site.xml -->
<!-- NameNode address -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://zzy:8020</value>
</property>
<!-- Hadoop data storage directory -->
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/software/hadoop-3.1.3/data</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>

<!-- hdfs-site.xml -->
<!-- NameNode web UI address -->
<property>
<name>dfs.namenode.http-address</name>
<value>zzy:9870</value>
</property>
<!-- SecondaryNameNode web UI address -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>zzy:9868</value>
</property>
<!-- Replication factor -->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<!-- Enable WebHDFS -->
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<!-- yarn-site.xml -->
<!-- Have MapReduce use the shuffle service -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- ResourceManager address -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>zzy</value>
</property>
<!-- Environment variable inheritance; needed when running the examples -->
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
<!-- Enable log aggregation -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- Log aggregation server address -->
<property>
<name>yarn.log.server.url</name>
<value>http://zzy:19888/jobhistory/logs</value>
</property>
<!-- Keep logs for 7 days -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<!-- mapred-site.xml -->
<!-- Run MapReduce jobs on YARN -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- Job history server web address -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>zzy:19888</value>
</property>

workers

node01
node02
node03
# Run all of the following commands on the NameNode host
hadoop namenode -format

start-all.sh

mapred --daemon start historyserver
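A quick health check after the first start, using standard Hadoop CLI tools:

# All DataNodes should be listed as live, with the expected capacity
hdfs dfsadmin -report
# All NodeManagers should be registered with the ResourceManager
yarn node -list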
101.37.71.111 zyh		
118.31.67.112 why
101.37.161.4 zzy
101.37.77.203 zjy

http://101.37.161.4:9870/ hadoop

http://zyh:9090/ spark Master
http://zyh:8081/ Worker

zyh mysql 3306 123456 hive
Server why/2/4(118.31.67.112) zyh/2/4(101.37.71.111) zzy/2/4(101.37.161.4) zjy/2/2(101.37.77.203)
zookeeper QuorumPeerMain QuorumPeerMain QuorumPeerMain
hadoop DataNode DataNode DataNode DataNode
NodeManager NodeManager NodeManager NodeManager
ResourceManager8088
NameNode 9870
JobHistoryServer 19888
spark Master 9090 Worker8081
kafka Kafka Kafka
hive RunJar RunJar
mysql mysql 8 (password 123456)
flink
sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--username root \
--password Ljj315.. \
--connect jdbc:mysql://leafdomain.cn/order_sys --table region_info \
--fields-terminated-by ',' \
--hive-import \
--hive-database order_sys \
--hive-table region_info
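A quick check that the Sqoop import landed in Hive, run from the Hive node; the database and table names come from the command above:

beeline -u jdbc:hive2://localhost:10000 -e "SELECT COUNT(*) FROM order_sys.region_info;"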
