# First we copy the values from values.yaml into variables to make them easier to access
{{- $lrlist := .Values.hyperParamValues.learningRate -}}
{{- $nblayerslist := .Values.hyperParamValues.hiddenLayers -}}
{{- $image := .Values.image -}}
{{- $useGPU := .Values.useGPU -}}
{{- $chartname := .Chart.Name -}}
{{- $chartversion := .Chart.Version -}}
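# For reference, these lookups assume a values.yaml shaped roughly like the
# following (illustrative values only, not part of this file):
#   hyperParamValues:
#     learningRate:
#       - 0.001
#       - 0.01
#     hiddenLayers:
#       - 5
#       - 7
#   image: <your training image>
#   useGPU: false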
# Then we loop over every value of $lrlist (learning rate) and $nblayerslist (hidden layer count)
# This creates one TFJob for every pair of learning rate and hidden layer count
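# For example, with the illustrative values above (2 learning rates x 2 layer
# counts), the template renders 4 TFJobs named module8-tf-paint-0-0,
# module8-tf-paint-0-1, module8-tf-paint-1-0 and module8-tf-paint-1-1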
{{- range $i, $lr := $lrlist }}
{{- range $j, $nblayers := $nblayerslist }}
apiVersion: kubeflow.org/v1beta1
kind: TFJob # Each one of our trainings will be a separate TFJob
metadata:
  name: module8-tf-paint-{{ $i }}-{{ $j }} # We give a unique name to each training
  labels:
    chart: "{{ $chartname }}-{{ $chartversion | replace "+" "_" }}"
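    # Kubernetes label values may not contain "+", so a version such as
    # 1.0.0+build1 becomes 1.0.0_build1 via the "replace" above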
spec:
  tfReplicaSpecs:
    MASTER:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: tensorflow
              image: {{ $image }}
              env:
                - name: LC_ALL
                  value: C.UTF-8
              args:
                # Here we pass a unique learning rate and hidden layer count to each instance.
                # We also quote the values to avoid potential formatting issues
                - --learning-rate
                - {{ $lr | quote }}
                - --hidden-layers
                - {{ $nblayers | quote }}
                - --logdir
                - /tmp/tensorflow/tf-paint-lr{{ $lr }}-d-{{ $nblayers }} # Each job saves its summaries in its own directory
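                # For example, a job with learning rate 0.001 and 4 hidden layers
                # writes its summaries to /tmp/tensorflow/tf-paint-lr0.001-d-4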
{{ if $useGPU }} # We only request GPUs if useGPU is set to true in values.yaml
              resources:
                limits:
                  nvidia.com/gpu: 1
{{ end }}
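              # Note: requesting nvidia.com/gpu assumes the cluster has GPU nodes
              # with the NVIDIA device plugin installed; otherwise these pods
              # will stay Pending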
              volumeMounts:
                - mountPath: /tmp/tensorflow
                  subPath: module8-tf-paint # As usual we want to save everything in a separate subdirectory
                  name: azurefile
          volumes:
            - name: azurefile
              persistentVolumeClaim:
                claimName: azurefile
---
{{- end }}
{{- end }}
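# To inspect the manifests this template renders without deploying anything,
# you can run, for example: helm template .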
# We only want one TensorBoard instance shared by all our trainings, not one per TFJob.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: tensorboard
  name: module8-tensorboard
spec:
  ports:
    - port: 80
      targetPort: 6006
  selector:
    app: tensorboard
  type: LoadBalancer
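# Once deployed, the LoadBalancer exposes TensorBoard on port 80; you can find
# its public IP with: kubectl get svc module8-tensorboard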
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  labels:
    app: tensorboard
  name: module8-tensorboard
spec:
  template:
    metadata:
      labels:
        app: tensorboard
    spec:
      volumes:
        - name: azurefile
          persistentVolumeClaim:
            claimName: azurefile
      containers:
        - name: tensorboard
          command:
            - /usr/local/bin/tensorboard
            - --logdir=/tmp/tensorflow
            - --host=0.0.0.0
          image: tensorflow/tensorflow
          ports:
            - containerPort: 6006
          volumeMounts:
            - mountPath: /tmp/tensorflow
              subPath: module8-tf-paint
              name: azurefile
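# TensorBoard mounts the same azurefile share and subPath that the TFJobs write
# to, so this single instance picks up the summaries of every run.
# To deploy the whole chart, e.g.: helm install . (Helm 2) or
# helm install <release-name> . (Helm 3), with <release-name> of your choosing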