From adabd77060f3db219b04f6962d498fae9c4ec7ce Mon Sep 17 00:00:00 2001 From: shankgan Date: Sat, 30 Jan 2021 18:02:02 -0800 Subject: [PATCH 01/11] Changing Dockerfile --- docker/Dockerfile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 5a69d1ff..11a10c62 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,13 @@ FROM tensorflow/tensorflow:nightly -COPY mnist.py / +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +WORKDIR /app + +COPY . /app/ + ENTRYPOINT ["python", "/mnist.py"] From f5afa0dd4704e5ee0abee8cebfbbe3365e7b8fef Mon Sep 17 00:00:00 2001 From: shankgan Date: Sat, 30 Jan 2021 18:03:09 -0800 Subject: [PATCH 02/11] Adding Example for MultiWorkerMirorredStrategy that trains a CNN model on the MNIST dataset --- docker/keras_mnist.py | 97 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 docker/keras_mnist.py diff --git a/docker/keras_mnist.py b/docker/keras_mnist.py new file mode 100644 index 00000000..2daea167 --- /dev/null +++ b/docker/keras_mnist.py @@ -0,0 +1,97 @@ +from __future__ import print_function + +import math +import os +import tensorflow as tf +import numpy as np +import json + +""" +This code serves as an example of using Tensorflow 2.0 Keras API to build and train a CNN model on the +MNIST dataset using the tf.distribute.MultiWorkerMirroredStrategy described here +https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy. +This code is very similar to the example provided here +https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras +Assumptions: + 1) The code assumes that the cluster configurations needed for the TF distribute strategy is available through the + TF_CONFIG environment variable. 
See the link provided above for details + 2) The model is checkpointed and saved in /pvcmnt by the chief worker process. All other worker processes checkpoint + their code in the /tmp directory +""" + + +# Used to run example using CPU only. Untested on GPU +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + +# Model save directory +MAIN_MODEL_PATH = '/pvcmnt' + +def _is_chief(task_type, task_id): + # If `task_type` is None, this may be operating as single worker, which works + # effectively as chief. + return task_type is None or task_type == 'chief' or ( + task_type == 'worker' and task_id == 0) + +def _get_temp_dir(task_id): + base_dirpath = 'workertemp_' + str(task_id) + temp_dir = os.path.join("/tmp", base_dirpath) + os.makedirs(temp_dir) + return temp_dir + +def write_filepath(strategy): + task_type, task_id = strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id + if not _is_chief(task_type, task_id): + checkpoint_dir = _get_temp_dir(task_id) + else: + base_dirpath = 'workertemp_' + str(task_id) + checkpoint_dir = os.path.join(MAIN_MODEL_PATH, base_dirpath) + if not os.path.exists(checkpoint_dir): + os.makedirs(checkpoint_dir) + return checkpoint_dir + +def mnist_dataset(batch_size): + (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() + # The `x` arrays are in uint8 and have values in the range [0, 255]. 
+ # You need to convert them to float32 with values in the range [0, 1] + x_train = x_train / np.float32(255) + y_train = y_train.astype(np.int64) + train_dataset = tf.data.Dataset.from_tensor_slices( + (x_train, y_train)).shuffle(60000).repeat().batch(batch_size) + return train_dataset + +def build_and_compile_cnn_model(): + model = tf.keras.Sequential([ + tf.keras.Input(shape=(28, 28)), + tf.keras.layers.Reshape(target_shape=(28, 28, 1)), + tf.keras.layers.Conv2D(32, 3, activation='relu'), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dense(10) + ]) + model.compile( + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), + metrics=['accuracy']) + return model + +def main(): + per_worker_batch_size = 64 + tf_config = json.loads(os.environ['TF_CONFIG']) + num_workers = len(tf_config['cluster']['worker']) + strategy = tf.distribute.MultiWorkerMirroredStrategy() + + global_batch_size = per_worker_batch_size * num_workers + multi_worker_dataset = mnist_dataset(global_batch_size) + + # missing needs to be fixed + # multi_worker_dataset = strategy.distribute_datasets_from_function(mnist_dataset(global_batch_size)) + + callbacks = [tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=write_filepath(strategy))] + with strategy.scope(): + multi_worker_model = build_and_compile_cnn_model() + multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70, + callbacks=callbacks) + multi_worker_model.save(filepath=write_filepath(strategy)) + +if __name__=="__main__": + main() \ No newline at end of file From 541b7bc3c0dddfebfc577ce405cfc578c6c08923 Mon Sep 17 00:00:00 2001 From: shankgan Date: Sat, 30 Jan 2021 18:55:17 -0800 Subject: [PATCH 03/11] Adding Jinja2 template for MultiWorkerMirroredExample --- kubernetes/MultiWorkerMirroredTemplate.jinja | 111 +++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 
kubernetes/MultiWorkerMirroredTemplate.jinja diff --git a/kubernetes/MultiWorkerMirroredTemplate.jinja b/kubernetes/MultiWorkerMirroredTemplate.jinja new file mode 100644 index 00000000..d1a78cf4 --- /dev/null +++ b/kubernetes/MultiWorkerMirroredTemplate.jinja @@ -0,0 +1,111 @@ +{%- set name = "tftest" -%} +{%- set image = "image-name" -%} +{%- set worker_replicas = 2 -%} +{%- set script = "keras_mnist.py" -%} +{%- set model_checkpoint_dir = "/pvcmnt" -%} +{%- set data_dir = "" -%} +{%- set pvc_name = "pvc-demo" -%} +{%- set port = 5000 -%} +{%- set create_pvc = True -%} +{%- set create_volume_inspector = True -%} + + +{%- macro worker_hosts() -%} + {%- for i in range(worker_replicas) -%} + {%- if not loop.first -%},{%- endif -%} + "{{ name }}-worker-{{ i }}:{{ port }}" + {%- endfor -%} +{%- endmacro -%} + +{%- for i in range(worker_replicas) -%} +kind: Service +apiVersion: v1 +metadata: + name: {{ name }}-worker-{{ i }} +spec: + selector: + name: {{ name }} + job: worker + task: "{{ i }}" + ports: + - port: {{ port }} +--- +kind: Job +apiVersion: batch/v1 +metadata: + name: {{ name }}-worker-{{ i }} +spec: + ttlSecondsAfterFinished: 600 + template: + metadata: + labels: + name: {{ name }} + job: worker + task: "{{ i }}" + spec: + containers: + - name: tensorflow + image: {{ image }} + ports: + - containerPort: {{ port }} + command: + - "python" + - "{{ script }}" + env: + - name: TF_CONFIG + value: '{"cluster": {"worker": [{{ worker_hosts() }}]}, "task": {"type": "worker", "index": {{ i }}}}' + args: + - "--data_dir={{ data_dir }}" + - "--model_checkpoint_dir={{ model_checkpoint_dir }}" + restartPolicy: Never +{% if i == 0 %} + volumeMounts: + - mountPath: "{{ model_checkpoint_dir }}" + name: pvc-mount + volumes: + - name: pvc-mount + persistentVolumeClaim: + claimName: {{ pvc_name }} +{% endif %}--- +{% endfor %} +{% if create_pvc %} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ pvc_name }} +spec: + accessModes: + - ReadWriteOnce + resources: 
+ requests: + storage: 10G +--- +{% endif %} +{% if create_volume_inspector %} +kind: Pod +apiVersion: v1 +metadata: + name: volume-inspector +spec: + volumes: + - name: volume-to-inspect + persistentVolumeClaim: + claimName: {{ pvc_name }} + containers: + - name: debugger + image: busybox + command: ['sleep', '3600'] + volumeMounts: + - mountPath: {{ model_checkpoint_dir }} + name: volume-to-inspect + resources: + limits: + memory: 512Mi + cpu: "1" +--- +{% endif %} + + + + + From bdb3b2b79ebc98f93f5f999812058448ead23afd Mon Sep 17 00:00:00 2001 From: shankgan Date: Sun, 31 Jan 2021 12:48:04 -0800 Subject: [PATCH 04/11] Adding Custom Training loop Example --- docker/custom_training_mnist.py | 170 ++++++++++++++++++++++++++++++++ docker/keras_mnist.py | 41 +++++--- 2 files changed, 197 insertions(+), 14 deletions(-) create mode 100644 docker/custom_training_mnist.py diff --git a/docker/custom_training_mnist.py b/docker/custom_training_mnist.py new file mode 100644 index 00000000..50922550 --- /dev/null +++ b/docker/custom_training_mnist.py @@ -0,0 +1,170 @@ +# ============================================================================== +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +# This code serves as an example of using Tensorflow 2.0 to build and train a CNN model on the +# Fashion MNIST dataset using the tf.distribute.MultiWorkerMirroredStrategy described here +# https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy +# using a custom training loop. This code is very similar to the example provided here +# https://www.tensorflow.org/tutorials/distribute/custom_training +# Assumptions: +# 1) The code assumes that the cluster configuration needed for the TF distribute strategy is available through the +# TF_CONFIG environment variable. See the link provided above for details +# 2) The model is checkpointed and saved in /pvcmnt by the chief worker process. + +import tensorflow as tf +import numpy as np +import os + +# Used to run example using CPU only. Untested on GPU +os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +MAIN_MODEL_PATH = '/pvcmnt' + +EPOCHS = 10 +BATCH_SIZE_PER_REPLICA = 64 +GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA + +def _is_chief(task_type, task_id): + # If `task_type` is None, this may be operating as single worker, which works + # effectively as chief. 
+ return task_type is None or task_type == 'chief' or ( + task_type == 'worker' and task_id == 0) + +def _get_temp_dir(task_id): + base_dirpath = 'workertemp_' + str(task_id) + temp_dir = os.path.join("/tmp", base_dirpath) + os.makedirs(temp_dir) + return temp_dir + +def write_filepath(strategy): + task_type, task_id = strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id + if not _is_chief(task_type, task_id): + checkpoint_dir = _get_temp_dir(task_id) + else: + base_dirpath = 'workertemp_' + str(task_id) + checkpoint_dir = os.path.join(MAIN_MODEL_PATH, base_dirpath) + if not os.path.exists(checkpoint_dir): + os.makedirs(checkpoint_dir) + return checkpoint_dir + +def create_model(): + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu'), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Conv2D(64, 3, activation='relu'), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(64, activation='relu'), + tf.keras.layers.Dense(10) + ]) + return model + +def get_dist_data_set(strategy, batch_size): + fashion_mnist = tf.keras.datasets.fashion_mnist + (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data() + # Adding a dimension to the array -> new shape == (28, 28, 1) + # We are doing this because the first layer in our model is a convolutional + # layer and it requires a 4D input (batch_size, height, width, channels). + # batch_size dimension will be added later on. + train_images = train_images[..., None] + test_images = test_images[..., None] + # Getting the images in [0, 1] range. 
+ train_images = train_images / np.float32(255) + test_images = test_images / np.float32(255) + train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels)).shuffle(60000).batch(batch_size) + test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(batch_size) + train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset) + test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset) + return train_dist_dataset, test_dist_dataset + +def main(): + global GLOBAL_BATCH_SIZE + strategy = tf.distribute.MultiWorkerMirroredStrategy() + GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync + train_dist_dataset, test_dist_dataset = get_dist_data_set(strategy, GLOBAL_BATCH_SIZE) + checkpoint_pfx = write_filepath(strategy) + with strategy.scope(): + model = create_model() + optimizer = tf.keras.optimizers.Adam() + checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) + loss_object = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, + reduction=tf.keras.losses.Reduction.NONE) + test_loss = tf.keras.metrics.Mean(name='test_loss') + train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + name='train_accuracy') + test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + name='test_accuracy') + + def compute_loss(labels, predictions): + per_example_loss = loss_object(labels, predictions) + return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE) + + def test_step(inputs): + images, labels = inputs + predictions = model(images, training=False) + t_loss = loss_object(labels, predictions) + test_loss.update_state(t_loss) + test_accuracy.update_state(labels, predictions) + + def train_step(inputs): + images, labels = inputs + with tf.GradientTape() as tape: + predictions = model(images, training=True) + loss = compute_loss(labels, predictions) + gradients = tape.gradient(loss, model.trainable_variables) + 
optimizer.apply_gradients(zip(gradients, model.trainable_variables)) + train_accuracy.update_state(labels, predictions) + return loss + + # `run` replicates the provided computation and runs it + # with the distributed input. + @tf.function + def distributed_train_step(dataset_inputs): + per_replica_losses = strategy.run(train_step, args=(dataset_inputs,)) + return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, + axis=None) + + @tf.function + def distributed_test_step(dataset_inputs): + return strategy.run(test_step, args=(dataset_inputs,)) + + for epoch in range(EPOCHS): + # TRAIN LOOP + total_loss = 0.0 + num_batches = 0 + for x in train_dist_dataset: + total_loss += distributed_train_step(x) + num_batches += 1 + train_loss = total_loss / num_batches + + # TEST LOOP + for x in test_dist_dataset: + distributed_test_step(x) + if epoch % 2 == 0: + checkpoint.save(checkpoint_pfx) + + template = ("Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, " + "Test Accuracy: {}") + print (template.format(epoch+1, train_loss, + train_accuracy.result()*100, test_loss.result(), + test_accuracy.result()*100)) + + test_loss.reset_states() + train_accuracy.reset_states() + test_accuracy.reset_states() + +if __name__=="__main__": + main() \ No newline at end of file diff --git a/docker/keras_mnist.py b/docker/keras_mnist.py index 2daea167..e3910e70 100644 --- a/docker/keras_mnist.py +++ b/docker/keras_mnist.py @@ -1,3 +1,30 @@ +# ============================================================================== +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# This code serves as an example of using Tensorflow 2.0 Keras API to build and train a CNN model on the +# MNIST dataset using the tf.distribute.MultiWorkerMirroredStrategy described here +# https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy. +# This code is very similar to the example provided here +# https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras +# Assumptions: +# 1) The code assumes that the cluster configuration needed for the TF distribute strategy is available through the +# TF_CONFIG environment variable. See the link provided above for details +# 2) The model is checkpointed and saved in /pvcmnt by the chief worker process. + + from __future__ import print_function import math @@ -6,20 +33,6 @@ import numpy as np import json -""" -This code serves as an example of using Tensorflow 2.0 Keras API to build and train a CNN model on the -MNIST dataset using the tf.distribute.MultiWorkerMirroredStrategy described here -https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy. -This code is very similar to the example provided here -https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras -Assumptions: - 1) The code assumes that the cluster configurations needed for the TF distribute strategy is available through the - TF_CONFIG environment variable. 
See the link provided above for details - 2) The model is checkpointed and saved in /pvcmnt by the chief worker process. All other worker processes checkpoint - their code in the /tmp directory -""" - - # Used to run example using CPU only. Untested on GPU os.environ["CUDA_VISIBLE_DEVICES"] = "-1" From 4842fbb8dc30639b0c57504fec0adcc721c2f190 Mon Sep 17 00:00:00 2001 From: shankgan Date: Sun, 31 Jan 2021 13:35:04 -0800 Subject: [PATCH 05/11] Amending Documentation --- README.md | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 099e5185..f17902fe 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ request. - [docker](docker) - Docker configuration for running TensorFlow on cluster managers. - [kubeflow](https://github.com/kubeflow/kubeflow) - A Kubernetes native platform for ML - * A K8s custom resource for running distributed [TensorFlow jobs](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#submitting-a-tensorflow-training-job) + * A K8s custom resource for running distributed [TensorFlow jobs](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#submitting-a-tensorflow-training-job) * Jupyter images for different versions of TensorFlow * [TFServing](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#serve-a-model-using-tensorflow-serving) Docker images and K8s templates - [kubernetes](kubernetes) - Templates for running distributed TensorFlow on @@ -26,12 +26,14 @@ request. ## Distributed TensorFlow +### Tensorflow 1 + See the [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) documentation for a description of how it works. The examples in this repository focus on the most common form of distributed training: between-graph replication with asynchronous updates. -### Common Setup for distributed training +#### Common Setup for distributed training Every distributed training program has some common setup. 
First, define flags so that the worker knows about other workers and knows what role it plays in @@ -73,7 +75,8 @@ if FLAGS.job_name == "ps": Afterwards, your code varies depending on the form of distributed training you intend on doing. The most common form is between-graph replication. -### Between-graph Replication +#### Between-graph Replication + In this mode, each worker separately constructs the exact same graph. Each worker then runs the graph in isolation, only sharing gradients with the @@ -95,6 +98,27 @@ with tf.device(tf.train.replica_device_setter( # Run the TensorFlow graph. ``` +### Tensorflow 2 + +For distributed training, the tensorflow server is implicitly started. +The main configuration required by the tensorflow libraries is the cluster and local process configuration +that can be passed as an environment variable. +Refer to [Distributed TensorFlow Concepts](https://www.tensorflow.org/guide/distributed_training) for concepts. +Refer to [Distributed TensorFlow Examples](https://www.tensorflow.org/tutorials/distribute/keras) for examples. 
+ +#### Sample TF_CONFIG cluster configuration for distributed training + +```python +os.environ["TF_CONFIG"] = json.dumps({ + "cluster": { + "worker": ["host1:port", "host2:port", "host3:port"], # Worker IP/Port locations + "ps": ["host4:port", "host5:port"], # Parameter Server IP/Port Locations + "chief": ["host6:port"] # Chief worker location + }, + "task": {"type": "worker", "index": 1} # Current Process configuration +}) +``` + ### Requirements To Run the Examples To run our examples, [Jinja templates](http://jinja.pocoo.org/) must be installed: From 5233d60b90ee887023fa7b13daffe7af78beb39c Mon Sep 17 00:00:00 2001 From: shankgan Date: Sun, 31 Jan 2021 13:47:59 -0800 Subject: [PATCH 06/11] bug fix --- docker/custom_training_mnist.py | 2 +- docker/keras_mnist.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/custom_training_mnist.py b/docker/custom_training_mnist.py index 50922550..fc4f4295 100644 --- a/docker/custom_training_mnist.py +++ b/docker/custom_training_mnist.py @@ -167,4 +167,4 @@ def distributed_test_step(dataset_inputs): test_accuracy.reset_states() if __name__=="__main__": - main() \ No newline at end of file + main() diff --git a/docker/keras_mnist.py b/docker/keras_mnist.py index e3910e70..9e2e5e9c 100644 --- a/docker/keras_mnist.py +++ b/docker/keras_mnist.py @@ -107,4 +107,4 @@ def main(): multi_worker_model.save(filepath=write_filepath(strategy)) if __name__=="__main__": - main() \ No newline at end of file + main() From 7269d7b91c70f30108541cde2ac7cd4d9e9181f9 Mon Sep 17 00:00:00 2001 From: shankgan Date: Fri, 5 Feb 2021 16:05:28 -0800 Subject: [PATCH 07/11] Adding more documentation --- README.md | 43 +++---- docker/Dockerfile.hdfs | 4 +- docker/README.md | 23 +++- kubernetes/MultiWorkerMirroredTemplate.jinja | 20 ++-- kubernetes/README.md | 114 ++++++++++++++++++- 5 files changed, 168 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index f17902fe..2e3717c0 100644 --- a/README.md +++ 
b/README.md @@ -26,6 +26,28 @@ request. ## Distributed TensorFlow +### Tensorflow 2 + +For distributed training, the tensorflow server is implicitly started. +The main configuration required by the tensorflow libraries is the cluster and local process configuration +that can be passed as an environment variable. +Refer to [Distributed TensorFlow Concepts](https://www.tensorflow.org/guide/distributed_training) for concepts. +Refer to [Distributed TensorFlow Examples](https://www.tensorflow.org/tutorials/distribute/keras) for examples. + +#### Sample TF_CONFIG cluster configuration for distributed training + +```python +os.environ["TF_CONFIG"] = json.dumps({ + "cluster": { + "worker": ["host1:port", "host2:port", "host3:port"], # Worker IP/Port locations + "ps": ["host4:port", "host5:port"], # Parameter Server IP/Port Locations + "chief": ["host6:port"] # Chief worker location + }, + "task": {"type": "worker", "index": 1} # Current Process configuration +}) +``` + + ### Tensorflow 1 See the [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) @@ -98,27 +120,6 @@ with tf.device(tf.train.replica_device_setter( # Run the TensorFlow graph. ``` -### Tensorflow 2 - -For distributed training, the tensorflow server is implicitly started. -The main configuration required by the tensorflow libraries is the cluster and local process configuration -that can be passed as an environment variable. -Refer to [Distributed TensorFlow Concepts](https://www.tensorflow.org/guide/distributed_training) for concepts. -Refer to [Distributed TensorFlow Examples](https://www.tensorflow.org/tutorials/distribute/keras) for examples. 
- -#### Sample TF_CONFIG cluster configuration for distributed training - -```python -os.environ["TF_CONFIG"] = json.dumps({ - "cluster": { - "worker": ["host1:port", "host2:port", "host3:port"], # Worker IP/Port locations - "ps": ["host4:port", "host5:port"], # Parameter Server IP/Port Locations - "chief": ["host6:port"] # Chief worker location - }, - "task": {"type": "worker", "index": 1} # Current Process configuration -}) -``` - ### Requirements To Run the Examples To run our examples, [Jinja templates](http://jinja.pocoo.org/) must be installed: diff --git a/docker/Dockerfile.hdfs b/docker/Dockerfile.hdfs index 5357b682..6e19fed6 100644 --- a/docker/Dockerfile.hdfs +++ b/docker/Dockerfile.hdfs @@ -23,6 +23,8 @@ ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:$JAVA_HOME/jre/lib/amd64/server ENV CLASSPATH /usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/httpcore-4.2.5.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-configuration-1.6.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-xc-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/gson-2.2.4.jar:/usr/local/hadoop/share/hadoop/common/lib/snappy-java-1.0.4.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jaxb-api-2.2.2.jar:/usr/local/hadoop/share/hadoop/common/lib/paranamer-2.3.jar:/usr/local/hadoop/share/hadoop/common/lib/apacheds-kerberos-codec-2.0.0-M15.jar:/usr/local/hadoop/share/hadoop/common/lib/netty-3.6.2.Final.jar:/usr/local/hadoop/share/hadoop/common/lib/hadoop-annotations-2.7.3.jar:/usr/local/hadoop/share/hadoop/common/lib/api-asn1-api-1.0.0-M20.jar:/usr/local/hadoop/share/hadoop/common/lib/xz-1.0.jar:/usr/local/hadoop/share/hadoop/common/lib/java-xmlbuilder-0.4.jar:/usr/local/hadoop/share/hadoop/common/lib/jetty-util-6.1.26.jar:/usr/local/hadoop/share/hadoop/common/lib/slf4j-api-1.7.10.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-cli-1.2.jar:/usr/local/hadoop/share/hadoop/common/lib/servlet-api-2.5.jar:/usr/local/hadoop/share/hadoop/common/lib/jsp-api-2.1.jar:/usr/loca
l/hadoop/share/hadoop/common/lib/protobuf-java-2.5.0.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-io-2.4.jar:/usr/local/hadoop/share/hadoop/common/lib/curator-recipes-2.7.1.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-compress-1.4.1.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-beanutils-1.7.0.jar:/usr/local/hadoop/share/hadoop/common/lib/mockito-all-1.8.5.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-lang-2.6.jar:/usr/local/hadoop/share/hadoop/common/lib/curator-client-2.7.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jersey-json-1.9.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-jaxrs-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-httpclient-3.1.jar:/usr/local/hadoop/share/hadoop/common/lib/zookeeper-3.4.6.jar:/usr/local/hadoop/share/hadoop/common/lib/curator-framework-2.7.1.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-net-3.1.jar:/usr/local/hadoop/share/hadoop/common/lib/xmlenc-0.52.jar:/usr/local/hadoop/share/hadoop/common/lib/avro-1.7.4.jar:/usr/local/hadoop/share/hadoop/common/lib/jettison-1.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-mapper-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/api-util-1.0.0-M20.jar:/usr/local/hadoop/share/hadoop/common/lib/activation-1.1.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-codec-1.4.jar:/usr/local/hadoop/share/hadoop/common/lib/stax-api-1.0-2.jar:/usr/local/hadoop/share/hadoop/common/lib/apacheds-i18n-2.0.0-M15.jar:/usr/local/hadoop/share/hadoop/common/lib/jersey-server-1.9.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/hadoop-auth-2.7.3.jar:/usr/local/hadoop/share/hadoop/common/lib/jetty-6.1.26.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-beanutils-core-1.8.0.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-collections-3.2.2.jar:/usr/local/hadoop/share/hadoop/common/lib/junit-4.11.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-digester-1.
8.jar:/usr/local/hadoop/share/hadoop/common/lib/hamcrest-core-1.3.jar:/usr/local/hadoop/share/hadoop/common/lib/jersey-core-1.9.jar:/usr/local/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar:/usr/local/hadoop/share/hadoop/common/lib/jsch-0.1.42.jar:/usr/local/hadoop/share/hadoop/common/lib/jaxb-impl-2.2.3-1.jar:/usr/local/hadoop/share/hadoop/common/lib/guava-11.0.2.jar:/usr/local/hadoop/share/hadoop/common/lib/httpclient-4.2.5.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-logging-1.1.3.jar:/usr/local/hadoop/share/hadoop/common/lib/htrace-core-3.1.0-incubating.jar:/usr/local/hadoop/share/hadoop/common/lib/asm-3.2.jar:/usr/local/hadoop/share/hadoop/common/lib/jsr305-3.0.0.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-math3-3.1.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jets3t-0.9.0.jar:/usr/local/hadoop/share/hadoop/common/lib/log4j-1.2.17.jar:/usr/local/hadoop/share/hadoop/common/hadoop-common-2.7.3.jar:/usr/local/hadoop/share/hadoop/common/hadoop-common-2.7.3-tests.jar:/usr/local/hadoop/share/hadoop/common/hadoop-nfs-2.7.3.jar:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-daemon-1.0.13.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/netty-3.6.2.Final.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/xercesImpl-2.9.1.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jetty-util-6.1.26.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-cli-1.2.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/servlet-api-2.5.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/netty-all-4.0.23.Final.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/protobuf-java-2.5.0.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-io-2.4.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/leveldbjni-all-1.8.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-lang-2.6.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/xmlenc-0.52.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jackson-mapper-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-codec-1.4.jar:/usr/local/hadoop/s
hare/hadoop/hdfs/lib/jersey-server-1.9.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jetty-6.1.26.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jersey-core-1.9.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/guava-11.0.2.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-logging-1.1.3.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/htrace-core-3.1.0-incubating.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/asm-3.2.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jsr305-3.0.0.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/xml-apis-1.3.04.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/log4j-1.2.17.jar:/usr/local/hadoop/share/hadoop/hdfs/hadoop-hdfs-2.7.3.jar:/usr/local/hadoop/share/hadoop/hdfs/hadoop-hdfs-2.7.3-tests.jar:/usr/local/hadoop/share/hadoop/hdfs/hadoop-hdfs-nfs-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jackson-xc-1.9.13.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jaxb-api-2.2.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-client-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/netty-3.6.2.Final.jar:/usr/local/hadoop/share/hadoop/yarn/lib/xz-1.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/aopalliance-1.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jetty-util-6.1.26.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-cli-1.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/servlet-api-2.5.jar:/usr/local/hadoop/share/hadoop/yarn/lib/protobuf-java-2.5.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-io-2.4.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-compress-1.4.1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/javax.inject-1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/leveldbjni-all-1.8.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-lang-2.6.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-json-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jackson-jaxrs-1.9.13.jar:/usr/local/hadoop/share/hadoop/yarn/lib/zookeeper-3.4.6.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-guice-1.9.jar:/usr/loca
l/hadoop/share/hadoop/yarn/lib/jettison-1.1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jackson-mapper-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/yarn/lib/zookeeper-3.4.6-tests.jar:/usr/local/hadoop/share/hadoop/yarn/lib/activation-1.1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-codec-1.4.jar:/usr/local/hadoop/share/hadoop/yarn/lib/stax-api-1.0-2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/guice-3.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/guice-servlet-3.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-server-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jetty-6.1.26.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-collections-3.2.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-core-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jaxb-impl-2.2.3-1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/guava-11.0.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-logging-1.1.3.jar:/usr/local/hadoop/share/hadoop/yarn/lib/asm-3.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jsr305-3.0.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/log4j-1.2.17.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-tests-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-api-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-nodemanager-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-applicationhistoryservice-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-common-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-registry-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-sharedcachemanager-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-client-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-applications-unmanaged-am-launcher-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-resourcemanager-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-applications-distributedshell-2.7.3.jar:/
usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-common-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-web-proxy-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/snappy-java-1.0.4.1.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/paranamer-2.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/netty-3.6.2.Final.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/hadoop-annotations-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/xz-1.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/aopalliance-1.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/protobuf-java-2.5.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/commons-io-2.4.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/commons-compress-1.4.1.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/javax.inject-1.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/leveldbjni-all-1.8.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-guice-1.9.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/avro-1.7.4.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jackson-mapper-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/guice-3.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/guice-servlet-3.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-server-1.9.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/junit-4.11.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/hamcrest-core-1.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-core-1.9.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/asm-3.2.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/log4j-1.2.17.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-hs-plugins-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-hs-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-common-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar:/usr/local/hadoop/share/hadoo
p/mapreduce/hadoop-mapreduce-client-app-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.7.3-tests.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-core-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-shuffle-2.7.3.jar:/contrib/capacity-scheduler/*.jar -COPY mnist.py / +WORKDIR /app + +COPY . /app/ ENTRYPOINT ["python", "/mnist.py"] diff --git a/docker/README.md b/docker/README.md index 6379a19a..427cc02f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -4,11 +4,16 @@ This directory contains example Dockerfiles to run TensorFlow on cluster managers. - [Dockerfile](Dockerfile) is the most basic example, which just adds a Python - training program on top of the tensorflow/tensorflow Docker image. + training program on top of the tensorflow/tensorflow Docker image. All training programs + in this directory will be copied into docker image - [Dockerfile.hdfs](Dockerfile.hdfs) installs Hadoop libraries and sets the appropriate environment variables to enable reading from HDFS. - [mnist.py](mnist.py) demonstrates the programmatic setup for distributed - TensorFlow training. + TensorFlow training using the tensorflow 1.x API. +- [keras_mnist.py](mnist.py) demonstrates how to train an MNIST classifier using + [tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). +- [custom_training_mnist.py](mnist.py) demonstrates how to train a fashion MNIST classifier using + [tf.distribute.MultiWorkerMirroredStrategy and Tensorflow 2.0 Custom Training Loop APIs](https://www.tensorflow.org/tutorials/distribute/custom_training). ## Best Practices @@ -46,3 +51,17 @@ program to convert mnist data to TFRecords. 
When running distributed TensorFlow, you should upload the converted data to a common location on distributed storage, such as GCS or HDFS. + +## Running the keras_mnist.py example + +The [keras_mnist.py](keras_mnist.py) example demonstrates how to train an MNIST classifier using +[tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). +The final model is saved to disk by the chief worker process. The disk is assumed to be mounted onto the running container by the cluster manager. +It assumes that the cluster configuration is passed in through the `TF_CONFIG` environment variable when deployed in the cluster + +## Running the custom_training_mnist.py example + +The [custom_training_mnist.py](mnist.py) example demonstrates how to train a fashion MNIST classifier using +[tf.distribute.MultiWorkerMirroredStrategy and Tensorflow 2.0 Custom Training Loop APIs](https://www.tensorflow.org/tutorials/distribute/custom_training). +The final model is saved to disk by the chief worker process. The disk is assumed to be mounted onto the running container by the cluster manager. +It assumes that the cluster configuration is passed in through the `TF_CONFIG` environment variable when deployed in the cluster. 
diff --git a/kubernetes/MultiWorkerMirroredTemplate.jinja b/kubernetes/MultiWorkerMirroredTemplate.jinja index d1a78cf4..f0db6d10 100644 --- a/kubernetes/MultiWorkerMirroredTemplate.jinja +++ b/kubernetes/MultiWorkerMirroredTemplate.jinja @@ -1,15 +1,16 @@ -{%- set name = "tftest" -%} {%- set image = "image-name" -%} {%- set worker_replicas = 2 -%} {%- set script = "keras_mnist.py" -%} {%- set model_checkpoint_dir = "/pvcmnt" -%} -{%- set data_dir = "" -%} -{%- set pvc_name = "pvc-demo" -%} +{%- set checkpoint_pvc_name = "pvc-demo" -%} {%- set port = 5000 -%} -{%- set create_pvc = True -%} +{%- set create_pvc_checkpoint = True -%} {%- set create_volume_inspector = True -%} +{%- set deploy = False -%} +{% if deploy %} + {%- macro worker_hosts() -%} {%- for i in range(worker_replicas) -%} {%- if not loop.first -%},{%- endif -%} @@ -55,7 +56,6 @@ spec: - name: TF_CONFIG value: '{"cluster": {"worker": [{{ worker_hosts() }}]}, "task": {"type": "worker", "index": {{ i }}}}' args: - - "--data_dir={{ data_dir }}" - "--model_checkpoint_dir={{ model_checkpoint_dir }}" restartPolicy: Never {% if i == 0 %} @@ -65,14 +65,16 @@ spec: volumes: - name: pvc-mount persistentVolumeClaim: - claimName: {{ pvc_name }} + claimName: {{ checkpoint_pvc_name }} {% endif %}--- {% endfor %} -{% if create_pvc %} + +{% endif %} +{% if create_pvc_checkpoint %} apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: {{ pvc_name }} + name: {{ checkpoint_pvc_name }} spec: accessModes: - ReadWriteOnce @@ -90,7 +92,7 @@ spec: volumes: - name: volume-to-inspect persistentVolumeClaim: - claimName: {{ pvc_name }} + claimName: {{ checkpoint_pvc_name }} containers: - name: debugger image: busybox diff --git a/kubernetes/README.md b/kubernetes/README.md index 7c5af8d5..2d678d32 100644 --- a/kubernetes/README.md +++ b/kubernetes/README.md @@ -3,7 +3,9 @@ This directory contains a template for running distributed TensorFlow on Kubernetes. 
-## Prerequisites +## Steps to train [mnist.py](../docker/mnist.py) + +### Prerequisites 1. You must be running Kubernetes 1.3 or above. If you are running an earlier version, the DNS addon must be enabled. See the @@ -12,7 +14,7 @@ Kubernetes. 2. [Jinja templates](http://jinja.pocoo.org/) must be installed. -## Steps to Run the job +### Steps to Run the job 1. Follow the instructions for creating the training program in the parent [README](../README.md). @@ -43,7 +45,7 @@ write to Google Cloud Storage. See the Google Cloud Storage section below. python render_template.py myjob.template.jinja | kubectl delete -f - ``` -## Google Cloud Storage +### Google Cloud Storage To support reading and writing to Google Cloud Storage, you need to set up a [Kubernetes secret](http://kubernetes.io/docs/user-guide/secrets/) with the @@ -63,3 +65,109 @@ credentials. 3. In your template, set `credential_secret_name` to `"credential"` (as specified above) and `credential_secret_key` to the `"[json_filename]"` in the template. + +## Steps to train MultiWorkerMirrored Strategy based examples + +The steps below are meant to train models using [MultiWorkerMirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) +using the tensorflow 2.0 API on the Kubernetes platform. Reference programs +such as [keras_mnist.py](../docker/keras_mnist.py) and +[custom_training_mnist.py](../docker/custom_training_mnist.py) are available in the docker directory. + +### Prerequisites + +1. [Jinja templates](http://jinja.pocoo.org/) must be installed. + +2. A Kubernetes cluster running Kubernetes 1.15 or above must be available. To create a test +cluster on the local machine, [follow steps here](https://kubernetes.io/docs/tutorials/kubernetes-basics/create-cluster/). Kubernetes clusters can also be created on all major cloud providers. 
For instance, +here are instructions to [create GKE clusters](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-regional-cluster). Make sure that you have atleast 12 G of RAM between all nodes in the clusters. + +3. For model storage and checkpointing, a [persistent-volume-claim](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) needs to be available to mount onto the chief worker pod. The steps below include the yaml to create a persistent-volume-claim for GKE backed by GCEPersistentDisk. + +### Steps to Run the job + +1. Follow the instructions in the parent [README](../README.md) to create a training program. + Sample training programs are already provided in the [docker](../docker) directory. + +2. Follow the instructions for building and pushing the Docker image to a docker registry + in the [Docker README](../docker/README.md). + +3. Copy the template file: + + ```sh + cp kubernetes/MultiWorkerMirroredTemplate.yaml.jinja myjob.template.jinja + ``` + +4. Edit the `myjob.template.jinja` file to edit job parameters. + 1. `script` - which training program needs to be run. This should be either + `keras_mnist.py` or `custom_training_mnist.py` or `your_own_training_example.py` + + 2. `name` - the prefix attached to all the Kubernetes jobs created + + 3. `worker_replicas` - number of parallel worker processes that train the example + + 4. `port` - the port used by tensorflow worker processes to communicate with each other + + 5. `model_checkpoint_dir` - directory where the model is checkpointed and saved from the chief worker process. + + 6. `checkpoint_pvc_name` - name of the persistent-volume-claim which should be mounted at `model_checkpoint_dir`. This volume will contain the checkpointed model. + + 7. `image` - name of the docker image created in step 2 that needs to be loaded onto the cluster + + 8. `deploy` - set to True when the manifest is actually expected to be deployed + + 9. 
`create_pvc_checkpoint` - Creates a ReadWriteOnce persistent volume claim to checkpoint the model if needed. The name of the claim `checkpoint_pvc_name` should also be specified. + + 10. `create_volume_inspector` - Create a pod to inspect the contents of the volume after the training job is complete. If this is `True`, `deploy` cannot be `True` since the checkpoint volume can be mounted as read-write by a single node. Inspection cannot happen when training is happenning. + +5. Run the job: + 1. Create a namespace to run your training jobs + + ```sh + kubectl create namespace + ``` + + 2. [Optional] First set `deploy` to `False`, `create_pvc_checkpoint` to `True` and set the name of `checkpoint_pvc_name` appropriately. Then run + + ```sh + python render_template.py myjob.template.jinja | kubectl create -n -f - + ``` + + This will create a persistent volume claim where you can checkpoint your image. + + 3. Set `deploy` to `True` with all parameters specified in step 4 and then run + + ```sh + python render_template.py myjob.template.jinja | kubectl create -n -f - + ``` + + This will create the Kubernetes jobs on the clusters. Each Job has a single service-endpoint and a single pod that runs the training image. You can track the running jobs in the cluster by running + + ```sh + kubectl get jobs -n + kubectl describe jobs -n + ``` + + In order to inspect the trainining logs that are running in the jobs, run + + ```sh + # Shows all the running pods + kubectl get pods -n + kubectl logs -n -p + ``` + + 4. Once the jobs are finished (based on the logs/output of kubectl get jobs), + the trained model can be inspected by a volume inspector pod. Set `deploy` to `False` + and `create_volume_inspector` to True. 
Then run + + ```sh + python render_template.py myjob.template.jinja | kubectl create -n -f - + ``` + + Then, access the pod through ssh + + ```sh + kubectl get pods -n + kubectl -n exec --stdin --tty -- /bin/bash + ``` + + The contents of the trained model are available for inspection at `model_checkpoint_dir` From 137ea6a1bed0f364780e9b234428f5aa6e8a2592 Mon Sep 17 00:00:00 2001 From: shankgan Date: Tue, 23 Feb 2021 08:24:44 -0800 Subject: [PATCH 08/11] Moving MultiWorkerTraining Examples and Documentation to the distribution_strategy folder --- README.md | 33 +---- .../multi_worker_mirrored_strategy/README.md | 117 ++++++++++++++++++ .../examples/Dockerfile | 13 ++ .../examples/README.md | 53 ++++++++ .../examples}/custom_training_mnist.py | 0 .../examples}/keras_mnist.py | 0 .../MultiWorkerMirroredTemplate.jinja | 0 docker/Dockerfile | 11 +- docker/Dockerfile.hdfs | 4 +- docker/README.md | 23 +--- kubernetes/README.md | 110 +--------------- 11 files changed, 193 insertions(+), 171 deletions(-) create mode 100644 distribution_strategy/multi_worker_mirrored_strategy/README.md create mode 100644 distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile create mode 100644 distribution_strategy/multi_worker_mirrored_strategy/examples/README.md rename {docker => distribution_strategy/multi_worker_mirrored_strategy/examples}/custom_training_mnist.py (100%) rename {docker => distribution_strategy/multi_worker_mirrored_strategy/examples}/keras_mnist.py (100%) rename {kubernetes => distribution_strategy/multi_worker_mirrored_strategy/kubernetes}/MultiWorkerMirroredTemplate.jinja (100%) diff --git a/README.md b/README.md index 2e3717c0..e2698296 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,11 @@ request. - [docker](docker) - Docker configuration for running TensorFlow on cluster managers. 
- [kubeflow](https://github.com/kubeflow/kubeflow) - A Kubernetes native platform for ML - * A K8s custom resource for running distributed [TensorFlow jobs](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#submitting-a-tensorflow-training-job) + * A K8s custom resource for running distributed [TensorFlow jobs](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#submitting-a-tensorflow-training-job) * Jupyter images for different versions of TensorFlow * [TFServing](https://github.com/kubeflow/kubeflow/blob/master/user_guide.md#serve-a-model-using-tensorflow-serving) Docker images and K8s templates - [kubernetes](kubernetes) - Templates for running distributed TensorFlow on - Kubernetes. + Kubernetes. For the most up-to-date examples, please also refer to the [distribution strategy](distribution_strategy) folder. - [marathon](marathon) - Templates for running distributed TensorFlow using Marathon, deployed on top of Mesos. - [hadoop](hadoop) - TFRecord file InputFormat/OutputFormat for Hadoop MapReduce @@ -26,36 +26,12 @@ request. ## Distributed TensorFlow -### Tensorflow 2 - -For distributed training, the tensorflow server is implicitly started. -The main configuration required by the tensorflow libraries is the cluster and local process configuration -that can be passed as an environment variable. -Refer to [Distributed TensorFlow Concepts](https://www.tensorflow.org/guide/distributed_training) for concepts. -Refer to [Distributed TensorFlow Examples](https://www.tensorflow.org/tutorials/distribute/keras) for examples. 
- -#### Sample TF_CONFIG cluster configuration for distributed training - -```python -os.environ["TF_CONFIG"] = json.dumps({ - "cluster": { - "worker": ["host1:port", "host2:port", "host3:port"], # Worker IP/Port locations - "ps": ["host4:port", "host5:port"], # Parameter Server IP/Port Locations - "chief": ["host6:port"] # Chief worker location - }, - "task": {"type": "worker", "index": 1} # Current Process configuration -}) -``` - - -### Tensorflow 1 - See the [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) documentation for a description of how it works. The examples in this repository focus on the most common form of distributed training: between-graph replication with asynchronous updates. -#### Common Setup for distributed training +### Common Setup for distributed training Every distributed training program has some common setup. First, define flags so that the worker knows about other workers and knows what role it plays in @@ -97,8 +73,7 @@ if FLAGS.job_name == "ps": Afterwards, your code varies depending on the form of distributed training you intend on doing. The most common form is between-graph replication. -#### Between-graph Replication - +### Between-graph Replication In this mode, each worker separately constructs the exact same graph. Each worker then runs the graph in isolation, only sharing gradients with the diff --git a/distribution_strategy/multi_worker_mirrored_strategy/README.md b/distribution_strategy/multi_worker_mirrored_strategy/README.md new file mode 100644 index 00000000..406804ff --- /dev/null +++ b/distribution_strategy/multi_worker_mirrored_strategy/README.md @@ -0,0 +1,117 @@ + +# MultiWorkerMirrored Training Strategy with examples + +The steps below are meant to train models using [MultiWorkerMirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) using the tensorflow 2.0 API on the Kubernetes platform. 
+ +Reference programs such as [keras_mnist.py](examples/keras_mnist.py) and +[custom_training_mnist.py](examples/custom_training_mnist.py) are available in the examples directory. + +The Kubernetes manifest templates and other cluster-specific configuration are available in the [kubernetes](kubernetes) directory. + +## Prerequisites + +1. (Optional) It is recommended that you have a Google Cloud project. Either create a new project or use an existing one. Install + [gcloud command-line tools](https://cloud.google.com/functions/docs/quickstart) + on your system, log in, set project and zone, etc. + +2. [Jinja templates](http://jinja.pocoo.org/) must be installed. + +3. A Kubernetes cluster running Kubernetes 1.15 or above must be available. To create a test +cluster on the local machine, [follow steps here](https://kubernetes.io/docs/tutorials/kubernetes-basics/create-cluster/). Kubernetes clusters can also be created on all major cloud providers. For instance, +here are instructions to [create GKE clusters](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-regional-cluster). Make sure that you have at least 12 GB of RAM between all nodes in the clusters. This should also install the `kubectl` tool on your system. + +4. Set context for `kubectl` so that `kubectl` knows which cluster to use: + + ```bash + kubectl config use-context ${CONTEXT_NAME} + ``` + +5. Install [Docker](https://docs.docker.com/get-docker/) for your system, while also creating an account that you can associate with your container images. + +6. For model storage and checkpointing, a [persistent-volume-claim](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) needs to be available to mount onto the chief worker pod. The steps below include the yaml to create a persistent-volume-claim for GKE backed by GCEPersistentDisk. + +### Steps to Run the job + +1. Follow the instructions for building and pushing the Docker image to a docker registry + in the [Docker README](examples/README.md). + +2. 
Copy the template file: + + ```sh + cp kubernetes/MultiWorkerMirroredTemplate.jinja myjob.template.jinja + ``` + +4. Edit the `myjob.template.jinja` file to set the job parameters. + 1. `script` - which training program needs to be run. This should be either + `keras_mnist.py` or `custom_training_mnist.py` or `your_own_training_example.py` + + 2. `name` - the prefix attached to all the Kubernetes jobs created + + 3. `worker_replicas` - number of parallel worker processes that train the example + + 4. `port` - the port used by tensorflow worker processes to communicate with each other + + 5. `model_checkpoint_dir` - directory where the model is checkpointed and saved from the chief worker process. + + 6. `checkpoint_pvc_name` - name of the persistent-volume-claim which should be mounted at `model_checkpoint_dir`. This volume will contain the checkpointed model. + + 7. `image` - name of the docker image created in step 1 that needs to be loaded onto the cluster + + 8. `deploy` - set to True when the manifest is actually expected to be deployed + + 9. `create_pvc_checkpoint` - Creates a ReadWriteOnce persistent volume claim to checkpoint the model if needed. The name of the claim `checkpoint_pvc_name` should also be specified. + + 10. `create_volume_inspector` - Create a pod to inspect the contents of the volume after the training job is complete. If this is `True`, `deploy` cannot be `True` since the checkpoint volume can be mounted as read-write by only a single node. Inspection cannot happen while training is in progress. + +5. Run the job: + 1. Create a namespace to run your training jobs + + ```sh + kubectl create namespace ${NAMESPACE} + ``` + + 2. [Optional] First set `deploy` to `False`, `create_pvc_checkpoint` to `True` and set the name of `checkpoint_pvc_name` appropriately. Then run + + ```sh + python ../../render_template.py myjob.template.jinja | kubectl create -n ${NAMESPACE} -f - + ``` + + This will create a persistent volume claim where you can checkpoint your model. + + 3. 
Set `deploy` to `True` with all parameters specified in step 4 and then run + + ```sh + python ../../render_template.py myjob.template.jinja | kubectl create -n ${NAMESPACE} -f - + ``` + + This will create the Kubernetes jobs on the cluster. Each Job has a single service-endpoint and a single pod that runs the training image. You can track the running jobs in the cluster by running + + ```sh + kubectl get jobs -n ${NAMESPACE} + kubectl describe jobs -n ${NAMESPACE} + ``` + + In order to inspect the training logs of the running jobs, run + + ```sh + # Shows all the running pods + kubectl get pods -n ${NAMESPACE} + kubectl logs -n ${NAMESPACE} ${POD_NAME} -p + ``` + + 4. Once the jobs are finished (based on the logs/output of kubectl get jobs), + the trained model can be inspected by a volume inspector pod. Set `deploy` to `False` + and `create_volume_inspector` to `True`. Then run + + ```sh + python ../../render_template.py myjob.template.jinja | kubectl create -n ${NAMESPACE} -f - + ``` + + Then, open a shell in the volume inspector pod + + ```sh + kubectl get pods -n ${NAMESPACE} + kubectl -n ${NAMESPACE} exec --stdin --tty ${POD_NAME} -- /bin/bash + ``` + + The contents of the trained model are available for inspection at `model_checkpoint_dir`. \ No newline at end of file diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile b/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile new file mode 100644 index 00000000..3510fe7a --- /dev/null +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile @@ -0,0 +1,13 @@ +FROM tensorflow/tensorflow:nightly + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +WORKDIR /app + +COPY . 
/app/ + +ENTRYPOINT ["python", "/keras_mnist.py"] \ No newline at end of file diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/README.md b/distribution_strategy/multi_worker_mirrored_strategy/examples/README.md new file mode 100644 index 00000000..3051a0dd --- /dev/null +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/README.md @@ -0,0 +1,53 @@ +# TensorFlow Docker Images + +This directory contains examples of MultiWorkerMirrored Training along with the Dockerfile to build them. + +- [Dockerfile](Dockerfile) contains all dependencies required to build a container image using docker with the training examples. +- [keras_mnist.py](keras_mnist.py) demonstrates how to train an MNIST classifier using + [tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). +- [custom_training_mnist.py](custom_training_mnist.py) demonstrates how to train a fashion MNIST classifier using + [tf.distribute.MultiWorkerMirroredStrategy and Tensorflow 2.0 Custom Training Loop APIs](https://www.tensorflow.org/tutorials/distribute/custom_training). + +## Best Practices + +- Always pin the TensorFlow version with the Docker image tag. This ensures that + TensorFlow updates don't adversely impact your training program for future + runs. +- When creating an image, specify version tags (see below). If you make code + changes, increment the version. Cluster managers will not pull an updated + Docker image if they have it cached. Also, versions ensure that you have + a single copy of the code running for each job. + +## Building the Docker Files + +Ensure that docker is installed on your system. + +First, pick an image name for the job. When running on a cluster manager, you +will want to push your images to a container registry. 
Note that both the +[Google Container Registry](https://cloud.google.com/container-registry/) +and the [Amazon EC2 Container Registry](https://aws.amazon.com/ecr/) require +special paths. We append `:v1` to version our images. Versioning images is +strongly recommended for reasons described in the best practices section. + +```sh +docker build -t ${IMAGE_NAME}:v1 -f Dockerfile . +# Use gcloud docker push instead if on Google Container Registry. +docker push ${IMAGE_NAME}:v1 +``` + +If you make any updates to the code, increment the version and rerun the above +commands with the new version. + +## Running the keras_mnist.py example + +The [keras_mnist.py](keras_mnist.py) example demonstrates how to train an MNIST classifier using +[tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). +The final model is saved to disk by the chief worker process. The disk is assumed to be mounted onto the running container by the cluster manager. +It assumes that the cluster configuration is passed in through the `TF_CONFIG` environment variable when deployed in the cluster. + +## Running the custom_training_mnist.py example + +The [custom_training_mnist.py](custom_training_mnist.py) example demonstrates how to train a fashion MNIST classifier using +[tf.distribute.MultiWorkerMirroredStrategy and Tensorflow 2.0 Custom Training Loop APIs](https://www.tensorflow.org/tutorials/distribute/custom_training). +The final model is saved to disk by the chief worker process. The disk is assumed to be mounted onto the running container by the cluster manager. +It assumes that the cluster configuration is passed in through the `TF_CONFIG` environment variable when deployed in the cluster. 
diff --git a/docker/custom_training_mnist.py b/distribution_strategy/multi_worker_mirrored_strategy/examples/custom_training_mnist.py similarity index 100% rename from docker/custom_training_mnist.py rename to distribution_strategy/multi_worker_mirrored_strategy/examples/custom_training_mnist.py diff --git a/docker/keras_mnist.py b/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py similarity index 100% rename from docker/keras_mnist.py rename to distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py diff --git a/kubernetes/MultiWorkerMirroredTemplate.jinja b/distribution_strategy/multi_worker_mirrored_strategy/kubernetes/MultiWorkerMirroredTemplate.jinja similarity index 100% rename from kubernetes/MultiWorkerMirroredTemplate.jinja rename to distribution_strategy/multi_worker_mirrored_strategy/kubernetes/MultiWorkerMirroredTemplate.jinja diff --git a/docker/Dockerfile b/docker/Dockerfile index 11a10c62..5a69d1ff 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,13 +1,4 @@ FROM tensorflow/tensorflow:nightly -# Keeps Python from generating .pyc files in the container -ENV PYTHONDONTWRITEBYTECODE=1 - -# Turns off buffering for easier container logging -ENV PYTHONUNBUFFERED=1 - -WORKDIR /app - -COPY . 
/app/ - +COPY mnist.py / ENTRYPOINT ["python", "/mnist.py"] diff --git a/docker/Dockerfile.hdfs b/docker/Dockerfile.hdfs index 6e19fed6..5357b682 100644 --- a/docker/Dockerfile.hdfs +++ b/docker/Dockerfile.hdfs @@ -23,8 +23,6 @@ ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:$JAVA_HOME/jre/lib/amd64/server ENV CLASSPATH /usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/httpcore-4.2.5.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-configuration-1.6.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-xc-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/gson-2.2.4.jar:/usr/local/hadoop/share/hadoop/common/lib/snappy-java-1.0.4.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jaxb-api-2.2.2.jar:/usr/local/hadoop/share/hadoop/common/lib/paranamer-2.3.jar:/usr/local/hadoop/share/hadoop/common/lib/apacheds-kerberos-codec-2.0.0-M15.jar:/usr/local/hadoop/share/hadoop/common/lib/netty-3.6.2.Final.jar:/usr/local/hadoop/share/hadoop/common/lib/hadoop-annotations-2.7.3.jar:/usr/local/hadoop/share/hadoop/common/lib/api-asn1-api-1.0.0-M20.jar:/usr/local/hadoop/share/hadoop/common/lib/xz-1.0.jar:/usr/local/hadoop/share/hadoop/common/lib/java-xmlbuilder-0.4.jar:/usr/local/hadoop/share/hadoop/common/lib/jetty-util-6.1.26.jar:/usr/local/hadoop/share/hadoop/common/lib/slf4j-api-1.7.10.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-cli-1.2.jar:/usr/local/hadoop/share/hadoop/common/lib/servlet-api-2.5.jar:/usr/local/hadoop/share/hadoop/common/lib/jsp-api-2.1.jar:/usr/local/hadoop/share/hadoop/common/lib/protobuf-java-2.5.0.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-io-2.4.jar:/usr/local/hadoop/share/hadoop/common/lib/curator-recipes-2.7.1.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-compress-1.4.1.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-beanutils-1.7.0.jar:/usr/local/hadoop/share/hadoop/common/lib/mockito-all-1.8.5.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-lang-2.6.jar:/usr/local/hadoop/share/hadoop/common/lib/cur
ator-client-2.7.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jersey-json-1.9.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-jaxrs-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-httpclient-3.1.jar:/usr/local/hadoop/share/hadoop/common/lib/zookeeper-3.4.6.jar:/usr/local/hadoop/share/hadoop/common/lib/curator-framework-2.7.1.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-net-3.1.jar:/usr/local/hadoop/share/hadoop/common/lib/xmlenc-0.52.jar:/usr/local/hadoop/share/hadoop/common/lib/avro-1.7.4.jar:/usr/local/hadoop/share/hadoop/common/lib/jettison-1.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-mapper-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/api-util-1.0.0-M20.jar:/usr/local/hadoop/share/hadoop/common/lib/activation-1.1.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-codec-1.4.jar:/usr/local/hadoop/share/hadoop/common/lib/stax-api-1.0-2.jar:/usr/local/hadoop/share/hadoop/common/lib/apacheds-i18n-2.0.0-M15.jar:/usr/local/hadoop/share/hadoop/common/lib/jersey-server-1.9.jar:/usr/local/hadoop/share/hadoop/common/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/common/lib/hadoop-auth-2.7.3.jar:/usr/local/hadoop/share/hadoop/common/lib/jetty-6.1.26.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-beanutils-core-1.8.0.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-collections-3.2.2.jar:/usr/local/hadoop/share/hadoop/common/lib/junit-4.11.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-digester-1.8.jar:/usr/local/hadoop/share/hadoop/common/lib/hamcrest-core-1.3.jar:/usr/local/hadoop/share/hadoop/common/lib/jersey-core-1.9.jar:/usr/local/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar:/usr/local/hadoop/share/hadoop/common/lib/jsch-0.1.42.jar:/usr/local/hadoop/share/hadoop/common/lib/jaxb-impl-2.2.3-1.jar:/usr/local/hadoop/share/hadoop/common/lib/guava-11.0.2.jar:/usr/local/hadoop/share/hadoop/common/lib/httpclient-4.2.5.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-logg
ing-1.1.3.jar:/usr/local/hadoop/share/hadoop/common/lib/htrace-core-3.1.0-incubating.jar:/usr/local/hadoop/share/hadoop/common/lib/asm-3.2.jar:/usr/local/hadoop/share/hadoop/common/lib/jsr305-3.0.0.jar:/usr/local/hadoop/share/hadoop/common/lib/commons-math3-3.1.1.jar:/usr/local/hadoop/share/hadoop/common/lib/jets3t-0.9.0.jar:/usr/local/hadoop/share/hadoop/common/lib/log4j-1.2.17.jar:/usr/local/hadoop/share/hadoop/common/hadoop-common-2.7.3.jar:/usr/local/hadoop/share/hadoop/common/hadoop-common-2.7.3-tests.jar:/usr/local/hadoop/share/hadoop/common/hadoop-nfs-2.7.3.jar:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-daemon-1.0.13.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/netty-3.6.2.Final.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/xercesImpl-2.9.1.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jetty-util-6.1.26.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-cli-1.2.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/servlet-api-2.5.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/netty-all-4.0.23.Final.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/protobuf-java-2.5.0.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-io-2.4.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/leveldbjni-all-1.8.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-lang-2.6.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/xmlenc-0.52.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jackson-mapper-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-codec-1.4.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jersey-server-1.9.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jetty-6.1.26.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/jersey-core-1.9.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/guava-11.0.2.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/commons-logging-1.1.3.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/htrace-core-3.1.0-incubating.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/asm-3.2.jar:/usr/local/hadoop/sha
re/hadoop/hdfs/lib/jsr305-3.0.0.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/xml-apis-1.3.04.jar:/usr/local/hadoop/share/hadoop/hdfs/lib/log4j-1.2.17.jar:/usr/local/hadoop/share/hadoop/hdfs/hadoop-hdfs-2.7.3.jar:/usr/local/hadoop/share/hadoop/hdfs/hadoop-hdfs-2.7.3-tests.jar:/usr/local/hadoop/share/hadoop/hdfs/hadoop-hdfs-nfs-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jackson-xc-1.9.13.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jaxb-api-2.2.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-client-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/netty-3.6.2.Final.jar:/usr/local/hadoop/share/hadoop/yarn/lib/xz-1.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/aopalliance-1.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jetty-util-6.1.26.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-cli-1.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/servlet-api-2.5.jar:/usr/local/hadoop/share/hadoop/yarn/lib/protobuf-java-2.5.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-io-2.4.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-compress-1.4.1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/javax.inject-1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/leveldbjni-all-1.8.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-lang-2.6.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-json-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jackson-jaxrs-1.9.13.jar:/usr/local/hadoop/share/hadoop/yarn/lib/zookeeper-3.4.6.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-guice-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jettison-1.1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jackson-mapper-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/yarn/lib/zookeeper-3.4.6-tests.jar:/usr/local/hadoop/share/hadoop/yarn/lib/activation-1.1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-codec-1.4.jar:/usr/local/hadoop/share/hadoop/yarn/lib/stax-api-1.0-2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/guice-3.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/guice-servlet-3.0.jar:/usr/local/hadoop
/share/hadoop/yarn/lib/jersey-server-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jetty-6.1.26.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-collections-3.2.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jersey-core-1.9.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jaxb-impl-2.2.3-1.jar:/usr/local/hadoop/share/hadoop/yarn/lib/guava-11.0.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/commons-logging-1.1.3.jar:/usr/local/hadoop/share/hadoop/yarn/lib/asm-3.2.jar:/usr/local/hadoop/share/hadoop/yarn/lib/jsr305-3.0.0.jar:/usr/local/hadoop/share/hadoop/yarn/lib/log4j-1.2.17.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-tests-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-api-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-nodemanager-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-applicationhistoryservice-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-common-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-registry-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-sharedcachemanager-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-client-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-applications-unmanaged-am-launcher-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-resourcemanager-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-applications-distributedshell-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-common-2.7.3.jar:/usr/local/hadoop/share/hadoop/yarn/hadoop-yarn-server-web-proxy-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/snappy-java-1.0.4.1.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/paranamer-2.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/netty-3.6.2.Final.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/hadoop-annotations-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/xz-1.0.jar:/usr/local/hadoop/share/hadoo
p/mapreduce/lib/aopalliance-1.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/protobuf-java-2.5.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/commons-io-2.4.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/commons-compress-1.4.1.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/javax.inject-1.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/leveldbjni-all-1.8.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-guice-1.9.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/avro-1.7.4.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jackson-mapper-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/guice-3.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/guice-servlet-3.0.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-server-1.9.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jackson-core-asl-1.9.13.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/junit-4.11.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/hamcrest-core-1.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/jersey-core-1.9.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/asm-3.2.jar:/usr/local/hadoop/share/hadoop/mapreduce/lib/log4j-1.2.17.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-hs-plugins-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-hs-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-common-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-app-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.7.3-tests.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-core-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.7.3.jar:/usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-shuffle-2.7.3.jar:/contrib/capacity-scheduler/*.jar -WORKDIR /app - -COPY . 
/app/ +COPY mnist.py / ENTRYPOINT ["python", "/mnist.py"] diff --git a/docker/README.md b/docker/README.md index 427cc02f..6379a19a 100644 --- a/docker/README.md +++ b/docker/README.md @@ -4,16 +4,11 @@ This directory contains example Dockerfiles to run TensorFlow on cluster managers. - [Dockerfile](Dockerfile) is the most basic example, which just adds a Python - training program on top of the tensorflow/tensorflow Docker image. All training programs - in this directory will be copied into docker image + training program on top of the tensorflow/tensorflow Docker image. - [Dockerfile.hdfs](Dockerfile.hdfs) installs Hadoop libraries and sets the appropriate environment variables to enable reading from HDFS. - [mnist.py](mnist.py) demonstrates the programmatic setup for distributed - TensorFlow training using the tensorflow 1.x API. -- [keras_mnist.py](mnist.py) demonstrates how to train an MNIST classifier using - [tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). -- [custom_training_mnist.py](mnist.py) demonstrates how to train a fashion MNIST classifier using - [tf.distribute.MultiWorkerMirroredStrategy and Tensorflow 2.0 Custom Training Loop APIs](https://www.tensorflow.org/tutorials/distribute/custom_training). + TensorFlow training. ## Best Practices @@ -51,17 +46,3 @@ program to convert mnist data to TFRecords. When running distributed TensorFlow, you should upload the converted data to a common location on distributed storage, such as GCS or HDFS. - -## Running the keras_mnist.py example - -The [keras_mnist.py](keras_mnist.py) example demonstrates how to train an MNIST classifier using -[tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). -The final model is saved to disk by the chief worker process. 
The disk is assumed to be mounted onto the running container by the cluster manager. -It assumes that the cluster configuration is passed in through the `TF_CONFIG` environment variable when deployed in the cluster - -## Running the custom_training_mnist.py example - -The [custom_training_mnist.py](mnist.py) example demonstrates how to train a fashion MNIST classifier using -[tf.distribute.MultiWorkerMirroredStrategy and Tensorflow 2.0 Custom Training Loop APIs](https://www.tensorflow.org/tutorials/distribute/custom_training). -The final model is saved to disk by the chief worker process. The disk is assumed to be mounted onto the running container by the cluster manager. -It assumes that the cluster configuration is passed in through the `TF_CONFIG` environment variable when deployed in the cluster. diff --git a/kubernetes/README.md b/kubernetes/README.md index 2d678d32..1aefd277 100644 --- a/kubernetes/README.md +++ b/kubernetes/README.md @@ -1,7 +1,7 @@ # Running Distributed TensorFlow on Kubernetes This directory contains a template for running distributed TensorFlow on -Kubernetes. +Kubernetes. For newer examples, refer to the [distribution strategy](../distribution_strategy) ## Steps to train [mnist.py](../docker/mnist.py) @@ -64,110 +64,4 @@ credentials. 3. In your template, set `credential_secret_name` to `"credential"` (as specified above) and `credential_secret_key` to the `"[json_filename]"` in - the template. - -## Steps to train MultiWorkerMirrored Strategy based examples - -The steps below are meant to train models using [MultiWorkerMirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) -using the tensorflow 2.0 API on the Kubernetes platform. Reference programs -such as [keras_mnist.py](../docker/keras_mnist.py) and -[custom_training_mnist.py](../docker/custom_training_mnist.py) are available in the docker directory. - -### Prerequisites - -1. 
[Jinja templates](http://jinja.pocoo.org/) must be installed. - -2. A Kubernetes cluster running Kubernetes 1.15 or above must be available. To create a test -cluster on the local machine, [follow steps here](https://kubernetes.io/docs/tutorials/kubernetes-basics/create-cluster/). Kubernetes clusters can also be created on all major cloud providers. For instance, -here are instructions to [create GKE clusters](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-regional-cluster). Make sure that you have atleast 12 G of RAM between all nodes in the clusters. - -3. For model storage and checkpointing, a [persistent-volume-claim](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) needs to be available to mount onto the chief worker pod. The steps below include the yaml to create a persistent-volume-claim for GKE backed by GCEPersistentDisk. - -### Steps to Run the job - -1. Follow the instructions in the parent [README](../README.md) to create a training program. - Sample training programs are already provided in the [docker](../docker) directory. - -2. Follow the instructions for building and pushing the Docker image to a docker registry - in the [Docker README](../docker/README.md). - -3. Copy the template file: - - ```sh - cp kubernetes/MultiWorkerMirroredTemplate.yaml.jinja myjob.template.jinja - ``` - -4. Edit the `myjob.template.jinja` file to edit job parameters. - 1. `script` - which training program needs to be run. This should be either - `keras_mnist.py` or `custom_training_mnist.py` or `your_own_training_example.py` - - 2. `name` - the prefix attached to all the Kubernetes jobs created - - 3. `worker_replicas` - number of parallel worker processes that train the example - - 4. `port` - the port used by tensorflow worker processes to communicate with each other - - 5. `model_checkpoint_dir` - directory where the model is checkpointed and saved from the chief worker process. - - 6. 
`checkpoint_pvc_name` - name of the persistent-volume-claim which should be mounted at `model_checkpoint_dir`. This volume will contain the checkpointed model. - - 7. `image` - name of the docker image created in step 2 that needs to be loaded onto the cluster - - 8. `deploy` - set to True when the manifest is actually expected to be deployed - - 9. `create_pvc_checkpoint` - Creates a ReadWriteOnce persistent volume claim to checkpoint the model if needed. The name of the claim `checkpoint_pvc_name` should also be specified. - - 10. `create_volume_inspector` - Create a pod to inspect the contents of the volume after the training job is complete. If this is `True`, `deploy` cannot be `True` since the checkpoint volume can be mounted as read-write by a single node. Inspection cannot happen when training is happenning. - -5. Run the job: - 1. Create a namespace to run your training jobs - - ```sh - kubectl create namespace - ``` - - 2. [Optional] First set `deploy` to `False`, `create_pvc_checkpoint` to `True` and set the name of `checkpoint_pvc_name` appropriately. Then run - - ```sh - python render_template.py myjob.template.jinja | kubectl create -n -f - - ``` - - This will create a persistent volume claim where you can checkpoint your image. - - 3. Set `deploy` to `True` with all parameters specified in step 4 and then run - - ```sh - python render_template.py myjob.template.jinja | kubectl create -n -f - - ``` - - This will create the Kubernetes jobs on the clusters. Each Job has a single service-endpoint and a single pod that runs the training image. You can track the running jobs in the cluster by running - - ```sh - kubectl get jobs -n - kubectl describe jobs -n - ``` - - In order to inspect the trainining logs that are running in the jobs, run - - ```sh - # Shows all the running pods - kubectl get pods -n - kubectl logs -n -p - ``` - - 4. 
Once the jobs are finished (based on the logs/output of kubectl get jobs), - the trained model can be inspected by a volume inspector pod. Set `deploy` to `False` - and `create_volume_inspector` to True. Then run - - ```sh - python render_template.py myjob.template.jinja | kubectl create -n -f - - ``` - - Then, access the pod through ssh - - ```sh - kubectl get pods -n - kubectl -n exec --stdin --tty -- /bin/bash - ``` - - The contents of the trained model are available for inspection at `model_checkpoint_dir` + the template. \ No newline at end of file From b7fe9f944407fcd5edb83ea33a73c7a119f7c2e8 Mon Sep 17 00:00:00 2001 From: shankgan Date: Mon, 1 Mar 2021 07:42:05 -0800 Subject: [PATCH 09/11] Minor cosmetic changes --- .../multi_worker_mirrored_strategy/README.md | 26 +++++++++---------- .../examples/Dockerfile | 2 +- .../examples/keras_mnist.py | 2 +- .../MultiWorkerMirroredTemplate.jinja | 8 +++--- 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/distribution_strategy/multi_worker_mirrored_strategy/README.md b/distribution_strategy/multi_worker_mirrored_strategy/README.md index 406804ff..161eaca1 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/README.md +++ b/distribution_strategy/multi_worker_mirrored_strategy/README.md @@ -41,7 +41,7 @@ here are instructions to [create GKE clusters](https://cloud.google.com/kubernet cp kubernetes/MultiWorkerMirroredTemplate.yaml.jinja myjob.template.jinja ``` -4. Edit the `myjob.template.jinja` file to edit job parameters. +3. Edit the `myjob.template.jinja` file to edit job parameters. 1. `script` - which training program needs to be run. This should be either `keras_mnist.py` or `custom_training_mnist.py` or `your_own_training_example.py` @@ -51,9 +51,9 @@ here are instructions to [create GKE clusters](https://cloud.google.com/kubernet 4. `port` - the port used by tensorflow worker processes to communicate with each other - 5. 
`model_checkpoint_dir` - directory where the model is checkpointed and saved from the chief worker process. + 5. `checkpoint_pvc_name` - name of the persistent-volume-claim that will contain the checkpointed model. - 6. `checkpoint_pvc_name` - name of the persistent-volume-claim which should be mounted at `model_checkpoint_dir`. This volume will contain the checkpointed model. + 6. `model_checkpoint_dir` - mount location for inspecting the trained model in the volume inspector pod. Meant to be set if Volume inspector pod is mounted. 7. `image` - name of the docker image created in step 2 that needs to be loaded onto the cluster @@ -63,25 +63,25 @@ here are instructions to [create GKE clusters](https://cloud.google.com/kubernet 10. `create_volume_inspector` - Create a pod to inspect the contents of the volume after the training job is complete. If this is `True`, `deploy` cannot be `True` since the checkpoint volume can be mounted as read-write by a single node. Inspection cannot happen when training is happenning. -5. Run the job: +4. Run the job: 1. Create a namespace to run your training jobs ```sh kubectl create namespace ``` - 2. [Optional] First set `deploy` to `False`, `create_pvc_checkpoint` to `True` and set the name of `checkpoint_pvc_name` appropriately. Then run + 2. [Optional: If Persistent volume does not already exist on cluster] First set `deploy` to `False`, `create_pvc_checkpoint` to `True` and set the name of `checkpoint_pvc_name` appropriately in the .jinja file. Then run ```sh - python ../../render_template.py myjob.template.jinja | kubectl create -n -f - + python ../../render_template.py myjob.template.jinja | kubectl apply -n -f - ``` - This will create a persistent volume claim where you can checkpoint your image. + This will create a persistent volume claim where you can checkpoint your model. In GKE, this claim will auto-create a GCE persistent disk resource to back the claim. - 3.
Set `deploy` to `True` with all parameters specified in step 4 and then run + 3. Set `deploy` to `True`, `create_pvc_checkpoint` to `False`, with all parameters specified in step 3 and then run ```sh - python ../../render_template.py myjob.template.jinja | kubectl create -n -f - + python ../../render_template.py myjob.template.jinja | kubectl apply -n -f - ``` This will create the Kubernetes jobs on the clusters. Each Job has a single service-endpoint and a single pod that runs the training image. You can track the running jobs in the cluster by running @@ -101,17 +101,17 @@ here are instructions to [create GKE clusters](https://cloud.google.com/kubernet 4. Once the jobs are finished (based on the logs/output of kubectl get jobs), the trained model can be inspected by a volume inspector pod. Set `deploy` to `False` - and `create_volume_inspector` to True. Then run + and `create_volume_inspector` to True. Also set `model_checkpoint_dir` to indicate the location where the trained model will be mounted. Then run ```sh - python ../../render_template.py myjob.template.jinja | kubectl create -n -f - + python ../../render_template.py myjob.template.jinja | kubectl apply -n -f - ``` - Then, access the pod through ssh + This will create the volume inspector pod. Then, access the pod through an interactive shell ```sh kubectl get pods -n - kubectl -n exec --stdin --tty -- /bin/bash + kubectl -n exec --stdin --tty -- /bin/sh ``` The contents of the trained model are available for inspection at `model_checkpoint_dir`. \ No newline at end of file diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile b/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile index 3510fe7a..36aa8034 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile @@ -10,4 +10,4 @@ WORKDIR /app COPY .
/app/ -ENTRYPOINT ["python", "/keras_mnist.py"] \ No newline at end of file +ENTRYPOINT ["python", "keras_mnist.py"] diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py b/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py index 9e2e5e9c..288fed89 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py @@ -102,7 +102,7 @@ def main(): callbacks = [tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=write_filepath(strategy))] with strategy.scope(): multi_worker_model = build_and_compile_cnn_model() - multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70, + multi_worker_model.fit(multi_worker_dataset, epochs=10, steps_per_epoch=70, callbacks=callbacks) multi_worker_model.save(filepath=write_filepath(strategy)) diff --git a/distribution_strategy/multi_worker_mirrored_strategy/kubernetes/MultiWorkerMirroredTemplate.jinja b/distribution_strategy/multi_worker_mirrored_strategy/kubernetes/MultiWorkerMirroredTemplate.jinja index f0db6d10..e4a7799c 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/kubernetes/MultiWorkerMirroredTemplate.jinja +++ b/distribution_strategy/multi_worker_mirrored_strategy/kubernetes/MultiWorkerMirroredTemplate.jinja @@ -1,3 +1,4 @@ +{%- set name = "tf-learning" -%} {%- set image = "image-name" -%} {%- set worker_replicas = 2 -%} {%- set script = "keras_mnist.py" -%} @@ -44,6 +45,7 @@ spec: job: worker task: "{{ i }}" spec: + restartPolicy: Never containers: - name: tensorflow image: {{ image }} @@ -55,12 +57,9 @@ spec: env: - name: TF_CONFIG value: '{"cluster": {"worker": [{{ worker_hosts() }}]}, "task": {"type": "worker", "index": {{ i }}}}' - args: - - "--model_checkpoint_dir={{ model_checkpoint_dir }}" - restartPolicy: Never {% if i == 0 %} volumeMounts: - - mountPath: "{{ model_checkpoint_dir }}" + - mountPath: /pvcmnt 
name: pvc-mount volumes: - name: pvc-mount @@ -103,7 +102,6 @@ spec: resources: limits: memory: 512Mi - cpu: "1" --- {% endif %} From 2fb34e46bb4a55019529aeec7b5482a584481565 Mon Sep 17 00:00:00 2001 From: shankgan Date: Thu, 1 Apr 2021 06:15:41 -0700 Subject: [PATCH 10/11] Incorporate Review comments --- .../multi_worker_mirrored_strategy/README.md | 2 +- .../examples/custom_training_mnist.py | 8 +++----- .../examples/keras_mnist.py | 10 +++++----- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/distribution_strategy/multi_worker_mirrored_strategy/README.md b/distribution_strategy/multi_worker_mirrored_strategy/README.md index 161eaca1..e0102587 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/README.md +++ b/distribution_strategy/multi_worker_mirrored_strategy/README.md @@ -1,7 +1,7 @@ # MultiWorkerMirrored Training Strategy with examples -The steps below are meant to train models using [MultiWorkerMirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) using the tensorflow 2.0 API on the Kubernetes platform. +The steps below are meant to train models using [MultiWorkerMirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) using the tensorflow 2.x API on the Kubernetes platform. Reference programs such as [keras_mnist.py](examples/keras_mnist.py) and [custom_training_mnist.py](examples/custom_training_mnist.py) are available in the examples directory. 
diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/custom_training_mnist.py b/distribution_strategy/multi_worker_mirrored_strategy/examples/custom_training_mnist.py index fc4f4295..7a0c2e05 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/examples/custom_training_mnist.py +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/custom_training_mnist.py @@ -1,5 +1,5 @@ # ============================================================================== -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ # limitations under the License. # ============================================================================== -# This code serves as an example of using Tensorflow 2.0 to build and train a CNN model on the +# This code serves as an example of using Tensorflow 2.x to build and train a CNN model on the # Fashion MNIST dataset using the tf.distribute.MultiWorkerMirroredStrategy described here # https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy # using a custom training loop. 
This code is very similar to the example provided here @@ -33,8 +33,7 @@ MAIN_MODEL_PATH = '/pvcmnt' EPOCHS = 10 -BATCH_SIZE_PER_REPLICA = 64 -GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA +GLOBAL_BATCH_SIZE = 128 def _is_chief(task_type, task_id): # If `task_type` is None, this may be operating as single worker, which works @@ -92,7 +91,6 @@ def get_dist_data_set(strategy, batch_size): def main(): global GLOBAL_BATCH_SIZE strategy = tf.distribute.MultiWorkerMirroredStrategy() - GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync train_dist_dataset, test_dist_dataset = get_dist_data_set(strategy, GLOBAL_BATCH_SIZE) checkpoint_pfx = write_filepath(strategy) with strategy.scope(): diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py b/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py index 288fed89..41882c74 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_mnist.py @@ -1,5 +1,5 @@ # ============================================================================== -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ # limitations under the License. # ============================================================================== -# This code serves as an example of using Tensorflow 2.0 Keras API to build and train a CNN model on the +# This code serves as an example of using Tensorflow 2.x Keras API to build and train a CNN model on the # MNIST dataset using the tf.distribute.MultiWorkerMirroredStrategy described here # https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy. 
# This code is very similar to the example provided here @@ -39,6 +39,8 @@ # Model save directory MAIN_MODEL_PATH = '/pvcmnt' +GLOBAL_BATCH_SIZE = 128 + def _is_chief(task_type, task_id): # If `task_type` is None, this may be operating as single worker, which works # effectively as chief. @@ -88,13 +90,11 @@ def build_and_compile_cnn_model(): return model def main(): - per_worker_batch_size = 64 tf_config = json.loads(os.environ['TF_CONFIG']) num_workers = len(tf_config['cluster']['worker']) strategy = tf.distribute.MultiWorkerMirroredStrategy() - global_batch_size = per_worker_batch_size * num_workers - multi_worker_dataset = mnist_dataset(global_batch_size) + multi_worker_dataset = mnist_dataset(GLOBAL_BATCH_SIZE) # missing needs to be fixed # multi_worker_dataset = strategy.distribute_datasets_from_function(mnist_dataset(global_batch_size)) From 598bb4d59393ba4961fd78404e6b025586b9a7f0 Mon Sep 17 00:00:00 2001 From: shankgan Date: Fri, 14 May 2021 10:23:05 -0700 Subject: [PATCH 11/11] Adding example to train the resnet56 model using MultiworkerMirroredTraining example on the cifar-10 dataset --- .../multi_worker_mirrored_strategy/README.md | 120 +++++- .../examples/Dockerfile.gpu | 30 ++ .../examples/README.md | 11 +- .../examples/keras_resnet_cifar.py | 373 ++++++++++++++++++ .../EnhancedMultiWorkerMirroredTemplate.j2 | 142 +++++++ 5 files changed, 670 insertions(+), 6 deletions(-) create mode 100644 distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile.gpu create mode 100644 distribution_strategy/multi_worker_mirrored_strategy/examples/keras_resnet_cifar.py create mode 100644 distribution_strategy/multi_worker_mirrored_strategy/kubernetes/EnhancedMultiWorkerMirroredTemplate.j2 diff --git a/distribution_strategy/multi_worker_mirrored_strategy/README.md b/distribution_strategy/multi_worker_mirrored_strategy/README.md index e0102587..1c68d154 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/README.md +++ 
b/distribution_strategy/multi_worker_mirrored_strategy/README.md @@ -4,7 +4,7 @@ The steps below are meant to train models using [MultiWorkerMirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) using the tensorflow 2.x API on the Kubernetes platform. Reference programs such as [keras_mnist.py](examples/keras_mnist.py) and -[custom_training_mnist.py](examples/custom_training_mnist.py) are available in the examples directory. +[custom_training_mnist.py](examples/custom_training_mnist.py) and [keras_resnet_cifar.py](examples/keras_resnet_cifar.py) are available in the examples directory. The Kubernetes manifest templates and other cluster specific configuration is available in the [kubernetes](kubernetes) directory @@ -28,14 +28,39 @@ here are instructions to [create GKE clusters](https://cloud.google.com/kubernet 5. Install [Docker](https://docs.docker.com/get-docker/) for your system, while also creating an account that you can associate with your container images. -6. For model storage and checkpointing, a [persistent-volume-claim](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) needs to be available to mount onto the chief worker pod. The steps below include the yaml to create a persistent-volume-claim for GKE backed by GCEPersistentDisk. +6. For the mnist examples, for model storage and checkpointing, a [persistent-volume-claim](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) needs to be available to mount onto the chief worker pod. The steps below include the yaml to create a persistent-volume-claim for GKE backed by GCEPersistentDisk. -### Steps to Run the job +### Additional prerequisites for resnet56 example + +1. Create a + [service account](https://cloud.google.com/compute/docs/access/service-accounts) + and download its key file in JSON format. 
Assign Storage Admin role for + [Google Cloud Storage](https://cloud.google.com/storage/) to this service account: + + ```bash + gcloud iam service-accounts create --display-name="" + ``` + + ```bash + gcloud projects add-iam-policy-binding \ + --member="serviceAccount:@.iam.gserviceaccount.com" \ + --role="roles/storage.admin" + ``` +2. Create a Kubernetes secret from the JSON key file of your service account: + + ```bash + kubectl create secret generic credential --from-file=key.json= + ``` + +3. For GPU based training, ensure your kubernetes cluster has a node-pool with gpu enabled. + The steps to achieve this on GKE are available [here](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus) + +## Steps to train mnist examples 1. Follow the instructions for building and pushing the Docker image to a docker registry in the [Docker README](examples/README.md). -2. Copy the template file: +2. Copy the template file `MultiWorkerMirroredTemplate.yaml.jinja`: ```sh cp kubernetes/MultiWorkerMirroredTemplate.yaml.jinja myjob.template.jinja @@ -114,4 +139,89 @@ here are instructions to [create GKE clusters](https://cloud.google.com/kubernet kubectl -n exec --stdin --tty -- /bin/sh ``` - The contents of the trained model are available for inspection at `model_checkpoint_dir`. \ No newline at end of file + The contents of the trained model are available for inspection at `model_checkpoint_dir`. + +## Steps to train resnet examples + +1. Follow the instructions for building and pushing the Docker image using `Dockerfile.gpu` to a docker registry + in the [Docker README](examples/README.md). + +2. Copy the template file `EnhancedMultiWorkerMirroredTemplate.j2` + + ```sh + cp kubernetes/EnhancedMultiWorkerMirroredTemplate.j2 myjob.template.jinja + ``` +3. 
Create three buckets for model data, checkpoints and training logs using either GCP web UI or gsutil tool (included with the gcloud tool you have installed above): + + ```bash + gsutil mb gs:// + ``` + You will use these bucket names to modify `data_dir`, `log_dir` and `model_dir` in step #5. + + +4. Download CIFAR-10 data and place them in your data_dir bucket. Head to the [ResNet in TensorFlow](https://github.com/tensorflow/models/tree/r1.13.0/official/resnet#cifar-10) directory to obtain CIFAR-10 data. Alternatively, you can use this [direct link](https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz) to download and extract the data yourself as well. + + ```bash + python cifar10_download_and_extract.py + ``` + + Upload the contents of cifar-10-batches-bin directory to your `data_dir` bucket. + + ```bash + gsutil -m cp cifar-10-batches-bin/* gs:/// + ``` + +5. Edit the `myjob.template.jinja` file to edit job parameters. + 1. `script` - which training program needs to be run. This should be either + `keras_resnet_cifar.py` or `your_own_training_example.py` + + 2. `name` - the prefix attached to all the Kubernetes jobs created + + 3. `worker_replicas` - number of parallel worker processes that train the example + + 4. `port` - the port used by tensorflow worker processes to communicate with each other. + + 5. `model_dir` - the GCP bucket path that stores the model checkpoints `gs://model_dir/` + + 6. `image` - name of the docker image created in step 1 that needs to be loaded onto the cluster + + 7. `log_dir` - the GCP bucket path where the logs are stored `gs://log_dir/` + + 8. `data_dir` - the GCP bucket path for the Cifar-10 dataset `gs://data_dir/` + + 9. `gcp_credential_secret` - the name of the secret created in the kubernetes cluster that contains the service account credentials + + 10. `batch_size` - the global batch size used for training + + 11. `num_train_epoch` - the number of training epochs + +6. Run the job: + 1. 
Create a namespace to run your training jobs + + ```sh + kubectl create namespace + ``` + + 2. Deploy the training workloads in the cluster + + ```sh + python ../../render_template.py myjob.template.jinja | kubectl apply -n -f - + ``` + + This will create the Kubernetes jobs on the clusters. Each Job has a single service-endpoint and a single pod that runs the training image. You can track the running jobs in the cluster by running + + ```sh + kubectl get jobs -n + kubectl describe jobs -n + ``` + + By default, this also deploys tensorboard on the cluster. + + ```sh + kubectl get services -n | grep tensorboard + ``` + + Note the external-ip corresponding to the service and the previously configured `port` in the yaml + The tensorboard service should be accessible through the web at `http://tensorboard-external-ip:port` + + 3. The final model should be available in the GCP bucket corresponding to `model_dir` configured in the yaml diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile.gpu b/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile.gpu new file mode 100644 index 00000000..0ebb5928 --- /dev/null +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/Dockerfile.gpu @@ -0,0 +1,30 @@ +FROM tensorflow/tensorflow:2.3.1-gpu-jupyter + +RUN apt-get install -y python3 && \ + apt install python3-pip + +RUN pip3 install absl-py && \ + pip3 install portpicker + +# Install git +RUN apt-get update && \ + apt-get install -y git && \ + apt-get install -y vim + +WORKDIR /app + +RUN git clone --single-branch --branch benchmark https://github.com/tensorflow/models.git && \ + mv models tensorflow_models && \ + git clone https://github.com/tensorflow/model-optimization.git && \ + mv model-optimization tensorflow_model_optimization + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +COPY . 
/app/ + +ENV PYTHONPATH "${PYTHONPATH}:/:/app/tensorflow_models" + +CMD ["python", "resnet_cifar_multiworker_strategy_keras.py"] \ No newline at end of file diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/README.md b/distribution_strategy/multi_worker_mirrored_strategy/examples/README.md index 3051a0dd..4b5f5682 100644 --- a/distribution_strategy/multi_worker_mirrored_strategy/examples/README.md +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/README.md @@ -3,11 +3,13 @@ This directory contains examples of MultiWorkerMirrored Training along with the docker file to build them - [Dockerfile](Dockerfile) contains all dependenices required to build a container image using docker with the training examples +- [Dockerfile.gpu](Dockerfile.gpu) contains all dependencies required to build a container image using docker with gpu and the tensorflow model garden - [keras_mnist.py](mnist.py) demonstrates how to train an MNIST classifier using [tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). - [custom_training_mnist.py](mnist.py) demonstrates how to train a fashion MNIST classifier using [tf.distribute.MultiWorkerMirroredStrategy and Tensorflow 2.0 Custom Training Loop APIs](https://www.tensorflow.org/tutorials/distribute/custom_training). - +- [keras_resnet_cifar.py](keras_resnet_cifar.py) demonstrates how to train the resnet56 model on the Cifar-10 dataset using + [tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). ## Best Practices - Always pin the TensorFlow version with the Docker image tag. 
This ensures that @@ -51,3 +53,10 @@ The [custom_training_mnist.py](mnist.py) example demonstrates how to train a fas [tf.distribute.MultiWorkerMirroredStrategy and Tensorflow 2.0 Custom Training Loop APIs](https://www.tensorflow.org/tutorials/distribute/custom_training). The final model is saved to disk by the chief worker process. The disk is assumed to be mounted onto the running container by the cluster manager. It assumes that the cluster configuration is passed in through the `TF_CONFIG` environment variable when deployed in the cluster. + +## Running the keras_resnet_cifar.py example + +The [keras_resnet_cifar.py](keras_resnet_cifar.py) example demonstrates how to train a Resnet56 model on the cifar-10 dataset using +[tf.distribute.MultiWorkerMirroredStrategy and Keras Tensorflow 2.0 API](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras). +The final model is saved to the GCP storage bucket. +It assumes that the cluster configuration is passed in through the `TF_CONFIG` environment variable when deployed in the cluster. diff --git a/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_resnet_cifar.py b/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_resnet_cifar.py new file mode 100644 index 00000000..ab0f0318 --- /dev/null +++ b/distribution_strategy/multi_worker_mirrored_strategy/examples/keras_resnet_cifar.py @@ -0,0 +1,373 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Runs a ResNet model on the Cifar-10 dataset.""" + +# This code serves as an example of using Tensorflow 2.0 Keras API to build and train a ResNet56 model on +# the Cifar 10 dataset using the tf.distribute.MultiWorkerMirroredStrategy described here +# https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy. +# This code is largely borrowed from +# https://github.com/tensorflow/models/blob/benchmark/official/benchmark/models/resnet_cifar_model.py +# with some minor tweaks to allow for training using GPU +# Assumptions: +# 1) The code assumes that the cluster configuration needed for the TF distribute strategy is available through the +# TF_CONFIG environment variable. See the link provided above for details +# 2) The libraries required to test this model are packaged into ./Dockerfile.gpu. 
Please refer to it + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Import libraries +from absl import app +from absl import flags +from absl import logging +import numpy as np +import tensorflow as tf +from tensorflow_models.official.benchmark.models import cifar_preprocessing +from tensorflow_models.official.benchmark.models import resnet_cifar_model +from tensorflow_models.official.benchmark.models import synthetic_util +from tensorflow_models.official.common import distribute_utils +from tensorflow_models.official.utils.flags import core as flags_core +#from tensorflow_models.official.utils.misc import keras_utils +from tensorflow_models.official.vision.image_classification.resnet import common +import multiprocessing +import os + +MAIN_MODEL_PATH = '/pvcmnt' + +# remove: duplicate function from keras_utils +def set_session_config(enable_xla=False): + """Sets the session config.""" + if enable_xla: + tf.config.optimizer.set_jit(True) + +# remove: duplicate function from keras_utils +def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, + num_gpus, per_gpu_thread_count): + """Set GPU thread mode and count, and adjust dataset threads count.""" + cpu_count = multiprocessing.cpu_count() + logging.info('Logical CPU cores: %s', cpu_count) + + # Allocate private thread pool for each GPU to schedule and launch kernels + per_gpu_thread_count = per_gpu_thread_count or 2 + os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode + os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) + logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) + logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) + + # Limit data preprocessing threadpool to CPU cores minus number of total GPU + # private threads and memory copy threads. 
+ total_gpu_thread_count = per_gpu_thread_count * num_gpus + num_runtime_threads = num_gpus + if not datasets_num_private_threads: + datasets_num_private_threads = min( + cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8) + logging.info('Set datasets_num_private_threads to %s', + datasets_num_private_threads) + +def _is_chief(task_type, task_id): + # If `task_type` is None, this may be operating as single worker, which works + # effectively as chief. + return task_type is None or task_type == 'chief' or ( + task_type == 'worker' and task_id == 0) + +def _get_temp_dir(task_id): + base_dirpath = 'workertemp_' + str(task_id) + temp_dir = os.path.join("/tmp", base_dirpath) + os.makedirs(temp_dir) + return temp_dir + +def write_filepath(strategy): + task_type, task_id = strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id + if not _is_chief(task_type, task_id): + checkpoint_dir = _get_temp_dir(task_id) + else: + base_dirpath = 'workertemp_' + str(task_id) + checkpoint_dir = os.path.join(MAIN_MODEL_PATH, base_dirpath) + if not os.path.exists(checkpoint_dir): + os.makedirs(checkpoint_dir) + return checkpoint_dir + + + +LR_SCHEDULE = [ # (multiplier, epoch to start) tuples + (0.1, 91), (0.01, 136), (0.001, 182) +] + + +def learning_rate_schedule(current_epoch, + current_batch, + batches_per_epoch, + batch_size): + """Handles linear scaling rule and LR decay. + Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the + provided scaling factor. + Args: + current_epoch: integer, current epoch indexed from 0. + current_batch: integer, current batch in the current epoch, indexed from 0. + batches_per_epoch: integer, number of steps in an epoch. + batch_size: integer, total batch sized. + Returns: + Adjusted learning rate. 
+ """ + del current_batch, batches_per_epoch # not used + initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128 + learning_rate = initial_learning_rate + for mult, start_epoch in LR_SCHEDULE: + if current_epoch >= start_epoch: + learning_rate = initial_learning_rate * mult + else: + break + return learning_rate + + +class LearningRateBatchScheduler(tf.keras.callbacks.Callback): + """Callback to update learning rate on every batch (not epoch boundaries). + N.B. Only support Keras optimizers, not TF optimizers. + Attributes: + schedule: a function that takes an epoch index and a batch index as input + (both integer, indexed from 0) and returns a new learning rate as + output (float). + """ + + def __init__(self, schedule, batch_size, steps_per_epoch): + super(LearningRateBatchScheduler, self).__init__() + self.schedule = schedule + self.steps_per_epoch = steps_per_epoch + self.batch_size = batch_size + self.epochs = -1 + self.prev_lr = -1 + + def on_epoch_begin(self, epoch, logs=None): + if not hasattr(self.model.optimizer, 'learning_rate'): + raise ValueError('Optimizer must have a "learning_rate" attribute.') + self.epochs += 1 + + def on_batch_begin(self, batch, logs=None): + """Executes before step begins.""" + lr = self.schedule(self.epochs, + batch, + self.steps_per_epoch, + self.batch_size) + if not isinstance(lr, (float, np.float32, np.float64)): + raise ValueError('The output of the "schedule" function should be float.') + if lr != self.prev_lr: + self.model.optimizer.learning_rate = lr # lr should be a float here + self.prev_lr = lr + logging.debug( + 'Epoch %05d Batch %05d: LearningRateBatchScheduler ' + 'change learning rate to %s.', self.epochs, batch, lr) + + +def run(flags_obj): + """Run ResNet Cifar-10 training and eval loop using native Keras APIs. + Args: + flags_obj: An object containing parsed flag values. + Raises: + ValueError: If fp16 is passed as it is not currently supported. + Returns: + Dictionary of training and eval stats. 
+ """ + #keras_utils.set_session_config( + # enable_xla=flags_obj.enable_xla) + set_session_config(enable_xla=True) + + # Execute flag override logic for better model performance + """ + if flags_obj.tf_gpu_thread_mode: + keras_utils.set_gpu_thread_mode_and_count( + per_gpu_thread_count=flags_obj.per_gpu_thread_count, + gpu_thread_mode=flags_obj.tf_gpu_thread_mode, + num_gpus=flags_obj.num_gpus, + datasets_num_private_threads=flags_obj.datasets_num_private_threads) + """ + if flags_obj.tf_gpu_thread_mode: + set_gpu_thread_mode_and_count( + per_gpu_thread_count=flags_obj.per_gpu_thread_count, + gpu_thread_mode=flags_obj.tf_gpu_thread_mode, + num_gpus=flags_obj.num_gpus, + datasets_num_private_threads=flags_obj.datasets_num_private_threads) + + common.set_cudnn_batchnorm_mode() + + dtype = flags_core.get_tf_dtype(flags_obj) + if dtype == 'fp16': + raise ValueError('dtype fp16 is not supported in Keras. Use the default ' + 'value(fp32).') + + data_format = flags_obj.data_format + if data_format is None: + data_format = ('channels_first' if tf.config.list_physical_devices('GPU') + else 'channels_last') + tf.keras.backend.set_image_data_format(data_format) + + """ + strategy = distribute_utils.get_distribution_strategy( + distribution_strategy=flags_obj.distribution_strategy, + num_gpus=flags_obj.num_gpus, + all_reduce_alg=flags_obj.all_reduce_alg, + num_packs=flags_obj.num_packs) + """ + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + if strategy: + # flags_obj.enable_get_next_as_optional controls whether enabling + # get_next_as_optional behavior in DistributedIterator. If true, last + # partial batch can be supported. 
+ strategy.extended.experimental_enable_get_next_as_optional = ( + flags_obj.enable_get_next_as_optional + ) + + strategy_scope = distribute_utils.get_strategy_scope(strategy) + + if flags_obj.use_synthetic_data: + synthetic_util.set_up_synthetic_data() + input_fn = common.get_synth_input_fn( + height=cifar_preprocessing.HEIGHT, + width=cifar_preprocessing.WIDTH, + num_channels=cifar_preprocessing.NUM_CHANNELS, + num_classes=cifar_preprocessing.NUM_CLASSES, + dtype=flags_core.get_tf_dtype(flags_obj), + drop_remainder=True) + else: + synthetic_util.undo_set_up_synthetic_data() + input_fn = cifar_preprocessing.input_fn + + train_input_dataset = input_fn( + is_training=True, + data_dir=flags_obj.data_dir, + batch_size=flags_obj.batch_size, + parse_record_fn=cifar_preprocessing.parse_record, + datasets_num_private_threads=flags_obj.datasets_num_private_threads, + dtype=dtype, + # Setting drop_remainder to avoid the partial batch logic in normalization + # layer, which triggers tf.where and leads to extra memory copy of input + # sizes between host and GPU. 
+ drop_remainder=(not flags_obj.enable_get_next_as_optional)) + + eval_input_dataset = None + if not flags_obj.skip_eval: + eval_input_dataset = input_fn( + is_training=False, + data_dir=flags_obj.data_dir, + batch_size=flags_obj.batch_size, + parse_record_fn=cifar_preprocessing.parse_record) + + steps_per_epoch = ( + cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size) + lr_schedule = 0.1 + if flags_obj.use_tensor_lr: + initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128 + lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay( + boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE), + values=[initial_learning_rate] + + list(p[0] * initial_learning_rate for p in LR_SCHEDULE)) + + with strategy_scope: + optimizer = common.get_optimizer(lr_schedule) + model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES) + model.compile( + loss='sparse_categorical_crossentropy', + optimizer=optimizer, + metrics=(['sparse_categorical_accuracy'] + if flags_obj.report_accuracy_metrics else None), + run_eagerly=flags_obj.run_eagerly) + + train_epochs = flags_obj.train_epochs + + callbacks = common.get_callbacks() + + if not flags_obj.use_tensor_lr: + lr_callback = LearningRateBatchScheduler( + schedule=learning_rate_schedule, + batch_size=flags_obj.batch_size, + steps_per_epoch=steps_per_epoch) + callbacks.append(lr_callback) + + tensorboard_callback = tf.keras.callbacks.TensorBoard( + log_dir="gs://shankgan-tf-exp-train-log-dir/") + callbacks.append(tensorboard_callback) + + # if mutliple epochs, ignore the train_steps flag. 
+ if train_epochs <= 1 and flags_obj.train_steps: + steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch) + train_epochs = 1 + + num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] // + flags_obj.batch_size) + + validation_data = eval_input_dataset + if flags_obj.skip_eval: + if flags_obj.set_learning_phase_to_train: + # TODO(haoyuzhang): Understand slowdown of setting learning phase when + # not using distribution strategy. + tf.keras.backend.set_learning_phase(1) + num_eval_steps = None + validation_data = None + + if not strategy and flags_obj.explicit_gpu_placement: + # TODO(b/135607227): Add device scope automatically in Keras training loop + # when not using distribition strategy. + no_dist_strat_device = tf.device('/device:GPU:0') + no_dist_strat_device.__enter__() + + logging.info("Beginning to fit the model.....") + history = model.fit(train_input_dataset, + epochs=train_epochs, + steps_per_epoch=steps_per_epoch, + callbacks=callbacks, + validation_steps=num_eval_steps, + validation_data=validation_data, + validation_freq=flags_obj.epochs_between_evals, + verbose=2) + eval_output = None + if not flags_obj.skip_eval: + eval_output = model.evaluate(eval_input_dataset, + steps=num_eval_steps, + verbose=2) + + if not strategy and flags_obj.explicit_gpu_placement: + no_dist_strat_device.__exit__() + + stats = common.build_stats(history, eval_output, callbacks) + return stats + + +def define_cifar_flags(): + + common.define_keras_flags() + data_dir = os.getenv("DATA_DIR") + model_dir = os.getenv("MODEL_DIR") + batch_size = int(os.getenv("BATCH_SIZE", default=512)) + num_train_epoch = int(os.getenv("NUM_TRAIN_EPOCH", default=100)) + + if not data_dir or not model_dir: + raise Exception("Data directory and Model Directory need to be specified!") + + flags_core.set_defaults(data_dir=data_dir, + model_dir=model_dir, + train_epochs=num_train_epoch, + epochs_between_evals=20, + batch_size=batch_size, + use_synthetic_data=False) # Changed the batch 
size + +def main(_): + return run(flags.FLAGS) + + +if __name__ == '__main__': + logging.set_verbosity(logging.INFO) + define_cifar_flags() + app.run(main) \ No newline at end of file diff --git a/distribution_strategy/multi_worker_mirrored_strategy/kubernetes/EnhancedMultiWorkerMirroredTemplate.j2 b/distribution_strategy/multi_worker_mirrored_strategy/kubernetes/EnhancedMultiWorkerMirroredTemplate.j2 new file mode 100644 index 00000000..8ea5e5ab --- /dev/null +++ b/distribution_strategy/multi_worker_mirrored_strategy/kubernetes/EnhancedMultiWorkerMirroredTemplate.j2 @@ -0,0 +1,142 @@ +{%- set name = "" -%} +{%- set image = "" -%} +{%- set worker_replicas = 2 -%} +{%- set script = "" -%} +{%- set gcp_credential_secret = "" %} +{%- set log_dir = "" %} +{%- set data_dir = "" %} +{%- set model_dir = "" %} +{%- set batch_size = 256 %} +{%- set num_train_epoch = 100 %} +{%- set port = 5000 -%} +{%- set run_tensorboard = true %} + + +{%- macro worker_hosts() -%} + {%- for i in range(worker_replicas) -%} + {%- if not loop.first -%},{%- endif -%} + "{{ name }}-worker-{{ i }}:{{ port }}" + {%- endfor -%} +{%- endmacro -%} + +{%- for i in range(worker_replicas) -%} +kind: Service +apiVersion: v1 +metadata: + name: {{ name }}-worker-{{ i }} +spec: + selector: + name: {{ name }} + job: worker + task: "{{ i }}" + ports: + - port: {{ port }} +--- +kind: Job +apiVersion: batch/v1 +metadata: + name: {{ name }}-worker-{{ i }} +spec: + ttlSecondsAfterFinished: 600 + template: + metadata: + labels: + name: {{ name }} + job: worker + task: "{{ i }}" + spec: + restartPolicy: Never + containers: + - name: tensorflow + image: {{ image }} + ports: + - containerPort: {{ port }} + command: + - "python" + - "{{ script }}" + env: + - name: TF_CONFIG + value: '{"cluster": {"worker": [{{ worker_hosts() }}]}, "task": {"type": "worker", "index": {{ i }}}}' + - name: GOOGLE_APPLICATION_CREDENTIALS + value: "/var/secrets/google/key.json" + - name: DATA_DIR + value: "{{ data_dir }}" + - name: 
MODEL_DIR + value: "{{ model_dir }}" + - name: NUM_TRAIN_EPOCH + value: "{{ num_train_epoch }}" + - name: BATCH_SIZE + value: "{{ batch_size }}" + ports: + - containerPort: {{ port }} + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: credential + mountPath: /var/secrets/google + volumes: + - name: credential + secret: + secretName: {{ gcp_credential_secret }} +--- +{% endfor %} + +{% if run_tensorboard %} +kind: Service +apiVersion: v1 +metadata: + name: resnet-tensorboard-0 +spec: + type: LoadBalancer + selector: + name: resnet + job: tensorboard + task: "0" + ports: + - port: {{ port }} +--- +kind: Deployment +apiVersion: apps/v1 +metadata: + name: resnet-tensorboard-0 +spec: + replicas: 1 + selector: + matchLabels: + name: resnet + job: tensorboard + task: "0" + template: + metadata: + labels: + name: resnet + job: tensorboard + task: "0" + spec: + containers: + - name: tensorflow + image: tensorflow/tensorflow + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: "/var/secrets/google/key.json" + ports: + - containerPort: {{ port }} + command: + - "tensorboard" + args: + - '--logdir= {{ log_dir }}' + - "--port={{ port }}" + - "--host=0.0.0.0" + volumeMounts: + - name: credential + mountPath: /var/secrets/google + volumes: + - name: credential + secret: + secretName: {{ gcp_credential_secret }} +--- +{% endif %} + + +