Skip to content

Commit 12a4896

Browse files
authored
[cherry-pick] Add the pytorch-mnist with GPU support container image (#1917)
1 parent 8dcc7d3 commit 12a4896

File tree

15 files changed

+61
-25
lines changed

15 files changed

+61
-25
lines changed

.github/workflows/publish-trial-images.yaml

+4-2
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,10 @@ jobs:
3131
include:
3232
- trial-name: mxnet-mnist
3333
dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
34-
- trial-name: pytorch-mnist
35-
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile
34+
- trial-name: pytorch-mnist-cpu
35+
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
36+
- trial-name: pytorch-mnist-gpu
37+
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
3638
- trial-name: tf-mnist-with-summaries
3739
dockerfile: examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile
3840
- trial-name: enas-cnn-cifar10-gpu

.github/workflows/pytorch-mnist-e2e-test.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
experiments: ${{ matrix.experiments }}
2525
training-operator: true
2626
# Comma Delimited
27-
trial-images: pytorch-mnist
27+
trial-images: pytorch-mnist-cpu
2828

2929
strategy:
3030
fail-fast: false

docs/images-location.md

+14-3
Original file line numberDiff line numberDiff line change
@@ -273,13 +273,24 @@ The following table shows images for training containers which are used in the
273273
</tr>
274274
<tr align="center">
275275
<td>
276-
<code>docker.io/kubeflowkatib/pytorch-mnist</code>
276+
<code>docker.io/kubeflowkatib/pytorch-mnist-cpu</code>
277277
</td>
278278
<td>
279-
PyTorch MNIST example with printing metrics to the file or StdOut
279+
PyTorch MNIST example with CPU support that prints metrics to a file or StdOut
280280
</td>
281281
<td>
282-
<a href="https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile">Dockerfile</a>
282+
<a href="https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu">Dockerfile</a>
283+
</td>
284+
</tr>
285+
<tr align="center">
286+
<td>
287+
<code>docker.io/kubeflowkatib/pytorch-mnist-gpu</code>
288+
</td>
289+
<td>
290+
PyTorch MNIST example with GPU support that prints metrics to a file or StdOut
291+
</td>
292+
<td>
293+
<a href="https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu">Dockerfile</a>
283294
</td>
284295
</tr>
285296
<tr align="center">

examples/v1beta1/early-stopping/median-stop-with-json-format.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ spec:
6262
spec:
6363
containers:
6464
- name: training-container
65-
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
65+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
6666
command:
6767
- "python3"
6868
- "/opt/pytorch-mnist/mnist.py"

examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ spec:
4646
spec:
4747
containers:
4848
- name: pytorch
49-
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
49+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
5050
command:
5151
- "python3"
5252
- "/opt/pytorch-mnist/mnist.py"
@@ -61,7 +61,7 @@ spec:
6161
spec:
6262
containers:
6363
- name: pytorch
64-
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
64+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
6565
command:
6666
- "python3"
6767
- "/opt/pytorch-mnist/mnist.py"

examples/v1beta1/metrics-collector/custom-metrics-collector.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ spec:
6767
spec:
6868
containers:
6969
- name: training-container
70-
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
70+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
7171
command:
7272
- "python3"
7373
- "/opt/pytorch-mnist/mnist.py"

examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ spec:
5252
spec:
5353
containers:
5454
- name: training-container
55-
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
55+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
5656
command:
5757
- "python3"
5858
- "/opt/pytorch-mnist/mnist.py"

examples/v1beta1/metrics-collector/file-metrics-collector.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ spec:
5454
spec:
5555
containers:
5656
- name: training-container
57-
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
57+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
5858
command:
5959
- "python3"
6060
- "/opt/pytorch-mnist/mnist.py"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime
2+
3+
ADD examples/v1beta1/trial-images/pytorch-mnist /opt/pytorch-mnist
4+
WORKDIR /opt/pytorch-mnist
5+
6+
# Add folder for the logs.
7+
RUN mkdir /katib
8+
RUN pip install --no-cache-dir -r requirements.txt
9+
10+
RUN chgrp -R 0 /opt/pytorch-mnist \
11+
&& chmod -R g+rwX /opt/pytorch-mnist \
12+
&& chgrp -R 0 /katib \
13+
&& chmod -R g+rwX /katib
14+
15+
ENTRYPOINT ["python3", "/opt/pytorch-mnist/mnist.py"]

manifests/v1beta1/components/controller/trial-templates.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ data:
5454
spec:
5555
containers:
5656
- name: pytorch
57-
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
57+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
5858
command:
5959
- "python3"
6060
- "/opt/pytorch-mnist/mnist.py"
@@ -68,7 +68,7 @@ data:
6868
spec:
6969
containers:
7070
- name: pytorch
71-
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
71+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
7272
command:
7373
- "python3"
7474
- "/opt/pytorch-mnist/mnist.py"

scripts/v1beta1/build.sh

+5-2
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,11 @@ else
123123
echo -e "\nBuilding mxnet mnist training container example...\n"
124124
docker build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile .
125125

126-
echo -e "\nBuilding PyTorch mnist training container example...\n"
127-
docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile .
126+
echo -e "\nBuilding PyTorch mnist training container example with CPU support...\n"
127+
docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist-cpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.cpu .
128+
129+
echo -e "\nBuilding PyTorch mnist training container example with GPU support...\n"
130+
docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist-gpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.gpu .
128131

129132
echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
130133
docker build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu .

scripts/v1beta1/push.sh

+5-2
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,11 @@ docker push "${REGISTRY}/mxnet-mnist:${TAG}"
9898
echo -e "\nPushing Tensorflow with summaries mnist training container example...\n"
9999
docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}"
100100

101-
echo -e "\nPushing PyTorch mnist training container example...\n"
102-
docker push "${REGISTRY}/pytorch-mnist:${TAG}"
101+
echo -e "\nPushing PyTorch mnist training container example with CPU support...\n"
102+
docker push "${REGISTRY}/pytorch-mnist-cpu:${TAG}"
103+
104+
echo -e "\nPushing PyTorch mnist training container example with GPU support...\n"
105+
docker push "${REGISTRY}/pytorch-mnist-gpu:${TAG}"
103106

104107
echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
105108
docker push "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}"

scripts/v1beta1/update-images.sh

+4-2
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ update_yaml_files "${CONFIG_PATH}" ":[^[:space:]].*\"" ":${TAG}\""
8383

8484
# Postfixes for each Trial image.
8585
MXNET_MNIST="mxnet-mnist"
86-
PYTORCH_MNIST="pytorch-mnist"
86+
PYTORCH_MNIST_CPU="pytorch-mnist-cpu"
87+
PYTORCH_MNIST_GPU="pytorch-mnist-gpu"
8788
TF_MNIST_WITH_SUMMARIES="tf-mnist-with-summaries"
8889
ENAS_GPU="enas-cnn-cifar10-gpu"
8990
ENAS_CPU="enas-cnn-cifar10-cpu"
@@ -93,7 +94,8 @@ SIMPLE_PBT="simple-pbt"
9394

9495
echo -e "Update Katib Trial training container images\n"
9596
update_yaml_files "./" "${OLD_PREFIX}${MXNET_MNIST}:.*" "${NEW_PREFIX}${MXNET_MNIST}:${TAG}"
96-
update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST}:.*" "${NEW_PREFIX}${PYTORCH_MNIST}:${TAG}"
97+
update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_CPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_CPU}:${TAG}"
98+
update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_GPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_GPU}:${TAG}"
9799
update_yaml_files "./" "${OLD_PREFIX}${TF_MNIST_WITH_SUMMARIES}:.*" "${NEW_PREFIX}${TF_MNIST_WITH_SUMMARIES}:${TAG}"
98100
update_yaml_files "./" "${OLD_PREFIX}${ENAS_GPU}:.*" "${NEW_PREFIX}${ENAS_GPU}:${TAG}"
99101
update_yaml_files "./" "${OLD_PREFIX}${ENAS_CPU}:.*" "${NEW_PREFIX}${ENAS_CPU}:${TAG}"

test/e2e/v1beta1/scripts/gh-actions/build-load.sh

+5-5
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ REGISTRY="docker.io/kubeflowkatib"
3030
TAG="e2e-test"
3131
VERSION="v1beta1"
3232
CMD_PREFIX="cmd"
33-
SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu")
33+
SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu" "pytorch-mnist-cpu")
3434

3535
IFS="," read -r -a TRIAL_IMAGE_ARRAY <<< "$TRIAL_IMAGES"
3636
IFS="," read -r -a EXPERIMENT_ARRAY <<< "$EXPERIMENTS"
@@ -51,7 +51,7 @@ _build_containers() {
5151
docker build --platform "$(uname -m)" -t "$REGISTRY/$CONTAINER_NAME:$TAG" -f "../../../../../$DOCKERFILE" ../../../../../
5252
}
5353

54-
_load_kind_cluster() {
54+
_load_minikube_cluster() {
5555
CONTAINER_NAME=${1:-"katib-controller"}
5656

5757
echo -e "\n\nLoading $CONTAINER_NAME image...\n\n"
@@ -99,7 +99,7 @@ run() {
9999
for s in "${suggestions[@]}"; do
100100
if [ "$s" == "$CONTAINER_NAME" ]; then
101101
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
102-
_load_kind_cluster "$CONTAINER_NAME"
102+
_load_minikube_cluster "$CONTAINER_NAME"
103103
break
104104
fi
105105
done
@@ -126,15 +126,15 @@ run() {
126126
for e in "${earlystoppings[@]}"; do
127127
if [ "$e" == "$CONTAINER_NAME" ]; then
128128
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
129-
_load_kind_cluster "$CONTAINER_NAME"
129+
_load_minikube_cluster "$CONTAINER_NAME"
130130
break
131131
fi
132132
done
133133

134134
# Others
135135
else
136136
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
137-
_load_kind_cluster "$CONTAINER_NAME"
137+
_load_minikube_cluster "$CONTAINER_NAME"
138138
fi
139139
}
140140

0 commit comments

Comments
 (0)