FROM apache/hadoop:3.3.6 AS hadoop_image
FROM ubuntu:24.04
# The steps here were partially determined by looking at
# docker history --no-trunc apache/hadoop:3.3.6
ENV HADOOP_VER=3.3.6
###
# Install python, java & other necessary binaries
###
# ubuntu 24.04 LTS (noble) only ships python3.12
# the autoremove step removes the many dependencies, including python3.12, pulled in by
# software-properties-common
RUN apt update -y \
&& apt install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt update -y \
&& apt install -y python3.11 openjdk-8-jre curl \
&& apt autoremove -y --purge software-properties-common python3.12 \
&& rm -rf /var/lib/apt/lists/*
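# Re-point python3 at 3.11 (the old python3 symlink should have been removed along with python3.12 above)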
RUN ln -s /usr/bin/python3.11 /usr/bin/python3
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/
###
# Copy startup scripts from the hadoop image
###
# No idea where the source for this stuff lives
COPY --from=hadoop_image /opt/envtoconf.py /opt/envtoconf.py
COPY --from=hadoop_image /opt/transformation.py /opt/transformation.py
COPY --from=hadoop_image /opt/starter.sh /opt/starter.sh
###
# Move sudo command from /opt/starter.sh here
###
# Comment from script: To avoid docker volume permission problems
RUN mkdir -p /data && chmod o+rwx /data
# Remove the line from the script. There are other sudo calls inside `if` blocks; cross that bridge later
RUN sed -i 's$sudo chmod o+rwx /data$# sudo chmod o+rwx /data$' /opt/starter.sh
###
# Set up hadoop user
###
RUN groupadd --gid 1001 hadoop \
&& useradd --uid 1001 --gid 1001 --home /opt/hadoop hadoop \
&& chown -R hadoop:hadoop /opt
USER hadoop
###
# Install hadoop
###
WORKDIR /opt
RUN curl -LSs -o hadoop.tgz https://dlcdn.apache.org/hadoop/common/hadoop-$HADOOP_VER/hadoop-$HADOOP_VER.tar.gz \
&& tar zxf hadoop.tgz \
&& rm hadoop.tgz \
&& mv hadoop* hadoop \
&& rm -r /opt/hadoop/share/doc # 0.5GB of docs
USER root
RUN mkdir -p /var/log/hadoop && chmod 1777 /var/log/hadoop
USER hadoop
ENV HADOOP_HOME=/opt/hadoop
ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
ENV HADOOP_LOG_DIR=/var/log/hadoop
ENV PATH=$PATH:/opt/hadoop/bin
###
# Hack the env-var config system because it's annoying
###
# This is pretty fragile. If the configuration starts breaking, this is one place to start debugging.
# The reason for this line is that configuring hadoop by env var is the easiest way to handle the
# config, since it means you don't need configuration templates that have to be updated
# every time you want to add a new config var (see the example below). However, the env var name
# has to include the file name, which contains a hyphen, and the key name, which usually includes
# periods. Both are illegal in shell variable names, so `export` commands fail. Interestingly,
# setting those same env vars in a Dockerfile or docker compose works, and they end up in the
# environment, punctuation and all.
# This hack removes the code that blanks the raw configuration files when the script is run,
# so that we can preload them with the configuration we want.
RUN sed -i -z 's#if name not in self\.configurables\.keys.*myfile.write("")##' /opt/envtoconf.py
RUN sed -i 's#/usr/bin/python#/usr/bin/python3#' /opt/envtoconf.py
# This is a hack to get the hadoop environment configuration code to process the raw configuration
# files regardless of whether a real config var that triggers that file is set
ENV YARN-SITE.XML_fakekey=hack_to_get_config_system_to_process_the_raw_config_file
ENV CORE-SITE.XML_fakekey=hack_to_get_config_system_to_process_the_raw_config_file
ENV HDFS-SITE.XML_fakekey=hack_to_get_config_system_to_process_the_raw_config_file
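# Example of the env-var config scheme (illustrative only; nothing in this image sets this key):
#   ENV CORE-SITE.XML_fs.defaultFS=hdfs://namenode:9000
# should, assuming the upstream envtoconf.py/transformation.py behave as described above, end up in
# core-site.xml as
#   <property><name>fs.defaultFS</name><value>hdfs://namenode:9000</value></property>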
###
# Enable the fair scheduler
###
# There are lots of config options, see
# https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/FairScheduler.html
ENV YARN-SITE.XML_yarn.resourcemanager.scheduler.class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler
ENV YARN-SITE.XML_yarn.scheduler.fair.allocation.file=/opt/hadoop/fair-scheduler.xml
COPY ./conf/yarn/fair-scheduler.xml /opt/hadoop/fair-scheduler.xml
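# For reference, a minimal allocation file might look roughly like this (a sketch only; the real
# file is ./conf/yarn/fair-scheduler.xml in this repo):
#   <?xml version="1.0"?>
#   <allocations>
#     <queue name="default">
#       <weight>1.0</weight>
#     </queue>
#   </allocations>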
###
# Enable s3
###
ENV HADOOP_OPTIONAL_TOOLS=hadoop-aws
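# S3A endpoint and credentials can then be supplied via the same env-var scheme, e.g.
# (hypothetical values, not set by this image):
#   CORE-SITE.XML_fs.s3a.endpoint=https://s3.example.com
#   CORE-SITE.XML_fs.s3a.access.key=<access key>
#   CORE-SITE.XML_fs.s3a.secret.key=<secret key>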
###
# Enable hardware capability detection (memory, CPU)
###
ENV YARN-SITE.XML_yarn.nodemanager.resource.detect-hardware-capabilities=true
ENV YARN-SITE.XML_yarn.nodemanager.resource.count-logical-processors-as-cores=true
# Not sure if yarn.nodemanager.resource.pcores-vcores-multiplier needs to be set; it seems like
# the yarn code should be able to figure it out.
# Currently resource limits are used only to decide whether to place a job on a node, based on the
# requested resources. Once a job is running it might take up more resources than it requested.
# If we find this to be a problem we can look into YARN's ability to kill jobs that overuse resources:
# https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/NodeManagerCGroupsMemory.html
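# If enforcement becomes necessary, one option (a sketch, not configured here) is to make sure the
# NodeManager memory checks, which kill containers exceeding their requests, are enabled:
#   ENV YARN-SITE.XML_yarn.nodemanager.pmem-check-enabled=true
#   ENV YARN-SITE.XML_yarn.nodemanager.vmem-check-enabled=true
# with the cgroups-based approach at the link above as the heavier-weight alternative.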
###
# Finish the build
###
COPY ./scripts/ /opt/scripts/
USER root
RUN chmod a+x /opt/scripts/*.sh
# Set permissions for /tmp as it will be mounted to external storage in Rancher
RUN chmod -R 1777 /tmp
USER hadoop
WORKDIR /opt/hadoop
ENTRYPOINT ["/opt/scripts/entrypoint.sh"]
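# Build sketch (the tag is an assumption, not defined anywhere in this repo):
#   docker build -t hadoop-yarn:3.3.6 .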