# First we copy the values from values.yaml into variables to make them easier to access
{{- $lrlist := .Values.hyperParamValues.learningRate -}}
{{- $nblayerslist := .Values.hyperParamValues.hiddenLayers -}}
{{- $image := .Values.image -}}
{{- $useGPU := .Values.useGPU -}}
{{- $chartname := .Chart.Name -}}
{{- $chartversion := .Chart.Version -}}
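# For reference, these lookups assume a values.yaml shaped roughly like the
# following (illustrative values only, not part of this file):
#   hyperParamValues:
#     learningRate:
#       - 0.001
#       - 0.01
#     hiddenLayers:
#       - 5
#       - 7
#   image: <your training image>
#   useGPU: false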
# Then we loop over every value of $lrlist (learning rate) and $nblayerslist (hidden layer count)
# This creates one TFJob for every pair of learning rate and hidden layer count
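# For example, with the illustrative values above (2 learning rates x 2 layer
# counts), the template renders 4 TFJobs named module8-tf-paint-0-0,
# module8-tf-paint-0-1, module8-tf-paint-1-0 and module8-tf-paint-1-1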
{{- range $i, $lr := $lrlist }}
{{- range $j, $nblayers := $nblayerslist }}
apiVersion: kubeflow.org/v1beta1
kind: TFJob # Each one of our trainings will be a separate TFJob
metadata:
  name: module8-tf-paint-{{ $i }}-{{ $j }} # We give a unique name to each training
  labels:
    chart: "{{ $chartname }}-{{ $chartversion | replace "+" "_" }}"
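    # Kubernetes label values may not contain "+", so a version such as
    # 1.0.0+build1 becomes 1.0.0_build1 via the "replace" above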
spec:
  tfReplicaSpecs:
    MASTER:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: tensorflow
              image: {{ $image }}
              env:
                - name: LC_ALL
                  value: C.UTF-8
              args:
                # Here we pass a unique learning rate and hidden layer count to each instance.
                # We also quote the values to avoid potential formatting issues
                - --learning-rate
                - {{ $lr | quote }}
                - --hidden-layers
                - {{ $nblayers | quote }}
                - --logdir
                - /tmp/tensorflow/tf-paint-lr{{ $lr }}-d-{{ $nblayers }} # Each job saves its summaries in its own directory
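                # For example, a job with learning rate 0.001 and 4 hidden layers
                # writes its summaries to /tmp/tensorflow/tf-paint-lr0.001-d-4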
{{ if $useGPU }} # We only request GPUs if useGPU is set to true in values.yaml
              resources:
                limits:
                  nvidia.com/gpu: 1
{{ end }}
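              # Note: requesting nvidia.com/gpu assumes the cluster has GPU nodes
              # with the NVIDIA device plugin installed; otherwise these pods
              # will stay Pending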
              volumeMounts:
                - mountPath: /tmp/tensorflow
                  subPath: module8-tf-paint # As usual we want to save everything in a separate subdirectory
                  name: azurefile
          volumes:
            - name: azurefile
              persistentVolumeClaim:
                claimName: azurefile
---
{{- end }}
{{- end }}
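# To inspect the manifests this template renders without deploying anything,
# you can run, for example: helm template .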
# We only want one TensorBoard instance shared by all our trainings, not one per TFJob.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: tensorboard
  name: module8-tensorboard
spec:
  ports:
    - port: 80
      targetPort: 6006
  selector:
    app: tensorboard
  type: LoadBalancer
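# Once deployed, the LoadBalancer exposes TensorBoard on port 80; you can find
# its public IP with: kubectl get svc module8-tensorboard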
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  labels:
    app: tensorboard
  name: module8-tensorboard
spec:
  template:
    metadata:
      labels:
        app: tensorboard
    spec:
      volumes:
        - name: azurefile
          persistentVolumeClaim:
            claimName: azurefile
      containers:
        - name: tensorboard
          command:
            - /usr/local/bin/tensorboard
            - --logdir=/tmp/tensorflow
            - --host=0.0.0.0
          image: tensorflow/tensorflow
          ports:
            - containerPort: 6006
          volumeMounts:
            - mountPath: /tmp/tensorflow
              subPath: module8-tf-paint
              name: azurefile
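# TensorBoard mounts the same azurefile share and subPath that the TFJobs write
# to, so this single instance picks up the summaries of every run.
# To deploy the whole chart, e.g.: helm install . (Helm 2) or
# helm install <release-name> . (Helm 3), with <release-name> of your choosing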