Fork me on GitHub

Hadoop集群环境搭建

Hadoop如何搭建集群环境?


测试环境

1
2
3
4
5
6
7
8
9
10
测试环境:
Ubuntu 14.04 LTS x64
Hadoop:hadoop-2.7.1.tar.gz

hostname IP role
spark-master: 192.168.108.20 master & worker
spark-slave1: 192.168.108.21 worker
spark-slave2: 192.168.108.22 worker

!默认情况全部操作在root下进行

下载解压Hadoop

地址:Hadoop官方下载


#配置ssh免密码登录

请参考:Tachyon集群部署


修改目录权限

1
sudo chmod -R 775 hadoop-2.7.1/

修改yarn-env.sh配置文件

1
2
3
4
vim etc/hadoop/yarn-env.sh

export HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_HOME}/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

修改hadoop-env.sh配置文件

1
2
3
4
5
vim etc/hadoop/hadoop-env.sh

export JAVA_HOME=/usr/lib/jvm/java
export HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_HOME}/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

修改core-site.xml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
vim etc/hadoop/core-site.xml

<configuration>
<!-- 指定hdfs的nameservice为masters -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://masters</value>
</property>
<!-- 指定hadoop临时目录 -->
<property>
<name>hadoop.tmp.dir</name>
<value>/home/jabo/software/hadoop-2.7.1/hadoop/tmp</value>
</property>
<!-- 指定zookeeper地址 -->
<property>
<name>ha.zookeeper.quorum</name>
<value>spark-master:2181,spark-slave1:2181,spark-slave2:2181</value>
</property>
<property>
<name>hadoop.native.lib</name>
<value>false</value>
</property>
</configuration>

修改hdfs-site.xml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
vim etc/hadoop/hdfs-site.xml

<configuration>
<!--指定hdfs的nameservice为masters,需要和core-site.xml中的保持一致 -->
<property>
<name>dfs.nameservices</name>
<value>masters</value>
</property>
<!-- masters下面有两个NameNode,分别是Master,Slave1 -->
<property>
<name>dfs.ha.namenodes.masters</name>
<value>Master,Slave1</value>
</property>
<!-- Master的RPC通信地址 -->
<property>
<name>dfs.namenode.rpc-address.masters.Master</name>
<value>spark-master:9000</value>
</property>
<!-- Master的http通信地址 -->
<property>
<name>dfs.namenode.http-address.masters.Master</name>
<value>spark-master:50070</value>
</property>
<!-- Slave1的RPC通信地址 -->
<property>
<name>dfs.namenode.rpc-address.masters.Slave1</name>
<value>spark-slave1:9000</value>
</property>
<!-- Slave1的http通信地址 -->
<property>
<name>dfs.namenode.http-address.masters.Slave1</name>
<value>spark-slave1:50070</value>
</property>
<!-- 指定NameNode的元数据在JournalNode上的存放位置 -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://spark-master:8485;spark-slave1:8485;spark-slave2:8485/masters</value>
</property>
<!-- 指定JournalNode在本地磁盘存放数据的位置 -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/home/jabo/software/hadoop-2.7.1/hadoop/journal</value>
</property>
<!-- 开启NameNode失败自动切换 -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- 配置失败自动切换实现方式 -->
<property>
<name>dfs.client.failover.proxy.provider.masters</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- 配置隔离机制 -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- 使用隔离机制时需要ssh免登陆 -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>~/.ssh/id_rsa</value>
</property>
</configuration>

修改slaves

1
2
3
4
5
vim etc/hadoop/slaves

spark-master
spark-slave1
spark-slave2

修改yarn-site.xml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
vim etc/hadoop/yarn-site.xml

<configuration>
<!-- 指定resourcemanager地址 -->
<!-- 开启RM高可靠 -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- 指定RM的cluster id -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>RM_HA_ID</value>
</property>
<!-- 指定RM的名字 -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- 分别指定RM的地址 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>spark-master</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>spark-slave1</value>
</property>
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<!-- 指定zk集群地址 -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>spark-master:2181,spark-slave1:2181,spark-slave2:2181</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>spark-master:8132</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>spark-slave1:8132</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>spark-master:8130</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>spark-slave1:8130</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>spark-master:8131</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>spark-slave1:8131</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>spark-master:8188</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>spark-slave1:8188</value>
</property>
</configuration>

修改mapred-site.xml

1
2
3
4
5
6
7
8
9
10
11
cp etc/hadoop/mapred-site.xml.template ./etc/hadoop/mapred-site.xml

vim etc/hadoop/mapred-site.xml

<configuration>
<!-- 指定mr框架为yarn方式 -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>

Hadoop目录分发

将Hadoop目录分别复制到其他主机


配置环境变量

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# 在每台主机上进行环境变量配置
sudo vim /etc/profile

export HADOOP_HOME=/home/jabo/software/hadoop-2.7.1
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_YARN_HOME=$HADOOP_HOME
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HADOOP_HOME/lib
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"

source /etc/profile

启动zookeeper集群

1
2
3
4
# 在每个节点上启动zookeeper
zkServer.sh start

启动journalnode

1
2
3
4
5
./sbin/hadoop-daemons.sh start journalnode

spark-slave1: starting journalnode, logging to /home/jabo/software/hadoop-2.7.1/logs/hadoop-root-journalnode-spark-slave1.out
spark-slave2: starting journalnode, logging to /home/jabo/software/hadoop-2.7.1/logs/hadoop-root-journalnode-spark-slave2.out
spark-master: starting journalnode, logging to /home/jabo/software/hadoop-2.7.1/logs/hadoop-root-journalnode-spark-master.out

格式化HDFS

1
2
hdfs namenode -format
scp -r /home/jabo/software/hadoop-2.7.1/hadoop/tmp/ spark-slave1:/home/jabo/software/hadoop-2.7.1/hadoop/

格式化zk

1
hdfs zkfc -formatZK

启动HDFS

1
./sbin/start-dfs.sh

启动YARN

1
./sbin/start-yarn.sh

测试HDFS

1
2
3
4
hadoop fs -put /etc/profile /profile
hadoop fs -ls /

-rw-r--r-- 3 root supergroup 1928 2016-01-07 16:36 /profile

测试Yarn

1
2
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar wordcount /profile /out
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar pi 20 10

查看进程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
root@spark-master:/home/jabo/software/hadoop-2.7.1# jps
4196 NodeManager
4236 Jps
3934 DFSZKFailoverController
4055 ResourceManager
3534 DataNode
3396 NameNode
2879 QuorumPeerMain
3077 JournalNode


root@spark-slave1:/home/jabo# jps
3811 Jps
3357 DataNode
3106 JournalNode
3697 NodeManager
3256 NameNode
3583 DFSZKFailoverController
2945 QuorumPeerMain


root@spark-slave2:/home/jabo# jps
3197 DataNode
3087 JournalNode
3390 NodeManager
3498 Jps
2924 QuorumPeerMain

统计访问

1
2
3
4
http://spark-master:50070
NameNode 'Master:9000' (active)
http://spark-slave1:50070
NameNode 'Slave1:9000' (standby)

Hadoop集群情况

1
http://spark-master:8188

转载请注明出处


Thank you for your support.