# 6_get_data_train_upload.yaml

# Tekton PipelineRun that embeds its pipeline inline: three tasks (get-data,
# train-model, upload-model) declared under spec.pipelineSpec.tasks.
apiVersion: tekton.dev/v1beta1
kind: PipelineRun
metadata:
  name: train-upload-stock-kfp
  annotations:
    # JSON object keyed by task name; each entry lists that task's output
    # artifacts with a storage key, an artifact name and a file path.
    tekton.dev/output_artifacts: '{"get-data": [{"key": "artifacts/$PIPELINERUN/get-data/train_data_output.tgz",
      "name": "get-data-train_data_output", "path": "/tmp/outputs/train_data_output/data"},
      {"key": "artifacts/$PIPELINERUN/get-data/validate_data_output.tgz", "name":
      "get-data-validate_data_output", "path": "/tmp/outputs/validate_data_output/data"}],
      "train-model": [{"key": "artifacts/$PIPELINERUN/train-model/model_output.tgz",
      "name": "train-model-model_output", "path": "/tmp/outputs/model_output/data"}]}'
    # JSON object keyed by consuming task; each entry names an input artifact
    # and the parent task that produces it.
    tekton.dev/input_artifacts: '{"train-model": [{"name": "get-data-train_data_output",
      "parent_task": "get-data"}, {"name": "get-data-validate_data_output", "parent_task":
      "get-data"}], "upload-model": [{"name": "train-model-model_output", "parent_task":
      "train-model"}]}'
    # Artifact store coordinates: bucket, host:port endpoint and URL scheme.
    tekton.dev/artifact_bucket: mlpipeline
    tekton.dev/artifact_endpoint: minio-service.kubeflow:9000
    tekton.dev/artifact_endpoint_scheme: http://
    # JSON object keyed by task name; each item pairs an output name with the
    # workspace path the task writes it to (empty list for upload-model).
    tekton.dev/artifact_items: '{"get-data": [["train_data_output", "$(workspaces.get-data.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/train_data_output"],
      ["validate_data_output", "$(workspaces.get-data.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/validate_data_output"]],
      "train-model": [["model_output", "$(workspaces.train-model.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/model_output"]],
      "upload-model": []}'
    # Quoted so the value stays the string "false" instead of a YAML boolean.
    sidecar.istio.io/inject: "false"
    tekton.dev/template: ''
    # Path template for files passed between tasks; the args of the tasks
    # declared below follow this layout.
    pipelines.kubeflow.org/big_data_passing_format: $(workspaces.$TASK_NAME.path)/artifacts/$ORIG_PR_NAME/$TASKRUN_NAME/$TASK_PARAM_NAME
    pipelines.kubeflow.org/pipeline_spec: '{"name": "train_upload_stock_kfp"}'
  labels:
    # Both labels are present but set to the empty string.
    pipelines.kubeflow.org/pipelinename: ''
    pipelines.kubeflow.org/generation: ''
spec:
  # The pipeline is defined inline rather than referenced by name.
  pipelineSpec:
    tasks:
    # Task get-data: runs an inline Python program that downloads train.csv and
    # validate.csv to the two output paths given in args.
    - name: get-data
      taskSpec:
        steps:
        - name: main
          # Option/value pairs parsed by the program's argparse section. The
          # values contain Tekton $(...) substitutions plus the literal token
          # $ORIG_PR_NAME.
          # NOTE(review): an env var named ORIG_PR_NAME is defined on this step,
          # but nothing visible here expands that token inside args -- confirm
          # where it is resolved.
          args:
          - --train-data-output
          - $(workspaces.get-data.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/train_data_output
          - --validate-data-output
          - $(workspaces.get-data.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/validate_data_output
          # The container runs command followed by args, so the shell snippet
          # below receives the Python source (the last command item) as $0 and
          # the args as "$@". It writes $0 to a temp file and runs it with
          # unbuffered python3.
          command:
          - sh
          - -ec
          - |
            program_path=$(mktemp)
            printf "%s" "$0" > "$program_path"
            python3 -u "$program_path" "$@"
          # Python program: get_data() fetches both CSV files with
          # urllib.request.urlretrieve; the argparse type= hook creates each
          # output path's parent directory while the arguments are parsed.
          - |
            def _make_parent_dirs_and_return_path(file_path: str):
                import os
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                return file_path

            def get_data(train_data_output_path, validate_data_output_path):
                import urllib.request
                print("starting download...")
                print("downloading training data")
                url = "https://raw.githubusercontent.com/nerc-project/fraud-detection/main/data/train.csv"
                urllib.request.urlretrieve(url, train_data_output_path)
                print("train data downloaded")
                print("downloading validation data")
                url = "https://raw.githubusercontent.com/nerc-project/fraud-detection/main/data/validate.csv"
                urllib.request.urlretrieve(url, validate_data_output_path)
                print("validation data downloaded")

            import argparse
            _parser = argparse.ArgumentParser(prog='Get data', description='')
            _parser.add_argument("--train-data-output", dest="train_data_output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
            _parser.add_argument("--validate-data-output", dest="validate_data_output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
            _parsed_args = vars(_parser.parse_args())

            _outputs = get_data(**_parsed_args)
          image: quay.io/modh/runtime-images:runtime-cuda-tensorflow-ubi9-python-3.9-2023a-20230817-b7e647e
          env:
          # Populated from the pod label custom.tekton.dev/originalPipelineRun.
          - name: ORIG_PR_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['custom.tekton.dev/originalPipelineRun']
        # Step output-taskrun-name: writes this TaskRun's name, without a trailing
        # newline, to the taskrun-name result file. train-model reads it through
        # $(tasks.get-data.results.taskrun-name).
        - name: output-taskrun-name
          image: busybox
          command:
          - sh
          - -ec
          - >-
            echo -n "$(context.taskRun.name)" > "$(results.taskrun-name.path)"
        # Step copy-results-artifacts: publishes each output as a Tekton result
        # while the running size total stays under 3072 bytes -- the file content
        # for files without non-printable characters, the archive listing for
        # directories. onError: continue keeps a failure here from failing the
        # task.
        - name: copy-results-artifacts
          image: busybox
          command:
          - sh
          - -ec
          - |
            set -exo pipefail
            TOTAL_SIZE=0
            # copy_artifact SRC RESULT_FILE
            copy_artifact() {
            # SUFFIX is a global shell variable: clear it on every call so a
            # directory artifact cannot leave ".tar.gz" set for the next one.
            SUFFIX=""
            if [ -d "$1" ]; then
              tar -czvf "$1".tar.gz "$1"
              SUFFIX=".tar.gz"
            fi
            ARTIFACT_SIZE=`wc -c "$1"${SUFFIX} | awk '{print $1}'`
            # Shell arithmetic instead of expr: expr exits 1 when the sum is 0,
            # which would abort the script under `set -e` for an empty artifact.
            TOTAL_SIZE=$((TOTAL_SIZE + ARTIFACT_SIZE))
            touch "$2"
            # POSIX test rather than the bash-only [[ ]] form.
            if [ "$TOTAL_SIZE" -lt 3072 ]; then
              if [ -d "$1" ]; then
                tar -tzf "$1".tar.gz > "$2"
              elif ! awk "/[^[:print:]]/{f=1} END{exit !f}" "$1"; then
                cp "$1" "$2"
              fi
            fi
            }
            copy_artifact "$(workspaces.get-data.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/train_data_output" "$(results.train-data-output.path)"
            copy_artifact "$(workspaces.get-data.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/validate_data_output" "$(results.validate-data-output.path)"
          onError: continue
          env:
          # Expanded by the shell in the copy_artifact calls above; populated from
          # the pod label custom.tekton.dev/originalPipelineRun.
          - name: ORIG_PR_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['custom.tekton.dev/originalPipelineRun']
        # Results written by the steps of this task; all are plain strings.
        results:
        - name: taskrun-name
          type: string
        - name: train-data-output
          description: /tmp/outputs/train_data_output/data
          type: string
        - name: validate-data-output
          description: /tmp/outputs/validate_data_output/data
          type: string
        metadata:
          annotations:
            pipelines.kubeflow.org/component_spec_digest: >-
              {"name": "Get data",
              "outputs": [{"name": "train_data_output"}, {"name": "validate_data_output"}],
              "version": "Get data@sha256=6c8ed9096811d00a434cb3e3ac4af2c8ef37d0f6d0f27b8ec6c74e2b88547e9c"}
          labels:
            pipelines.kubeflow.org/cache_enabled: 'true'
        # Workspace name the steps refer to as $(workspaces.get-data.path).
        workspaces:
        - name: get-data
      # Binds the task workspace to the pipeline-level workspace.
      workspaces:
      - name: get-data
        workspace: train-upload-stock-kfp
    # Task train-model: installs tf2onnx and seaborn, then runs an inline Python
    # program that trains a Keras classifier on the CSV files written by
    # get-data and saves it in ONNX format.
    - name: train-model
      params:
      # Name of the get-data TaskRun; used below to build the input file paths.
      - name: get-data-trname
        value: $(tasks.get-data.results.taskrun-name)
      taskSpec:
        steps:
        - name: main
          # Option/value pairs parsed by the program's argparse section. Inputs
          # live under the get-data TaskRun's directory, the output under this
          # TaskRun's directory.
          # NOTE(review): the paths contain the literal token $ORIG_PR_NAME;
          # nothing visible here expands it inside args -- confirm where it is
          # resolved.
          args:
          - --train-data-input
          - $(workspaces.train-model.path)/artifacts/$ORIG_PR_NAME/$(params.get-data-trname)/train_data_output
          - --validate-data-input
          - $(workspaces.train-model.path)/artifacts/$ORIG_PR_NAME/$(params.get-data-trname)/validate_data_output
          - --model-output
          - $(workspaces.train-model.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/model_output
          # Two nested shell wrappers. The outer `sh -c` attempts a pip install,
          # retries it with --user if that fails, and on success runs "$0" "$@",
          # which is the inner `sh -ec` wrapper. The inner wrapper writes the
          # Python source (its $0) to a temp file and runs it with the args.
          command:
          - sh
          - -c
          - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
            'tf2onnx' 'seaborn' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip
            install --quiet --no-warn-script-location 'tf2onnx' 'seaborn' --user)
            && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp)
            printf "%s" "$0" > "$program_path"
            python3 -u "$program_path" "$@"
          # Python program: train_model() selects feature columns 1, 2, 4, 5, 6
          # and label column 7 by position, fits a StandardScaler on the training
          # features only, trains for 2 epochs with balanced class weights, and
          # saves the converted model with onnx.save to --model-output.
          # The fitted scaler is pickled to the relative path artifact/scaler.pkl,
          # not to any of the paths passed in args.
          # NOTE(review): several field names in the program's own comment block
          # appear without underscores (e.g. "distancefromhome"); the program
          # text is left untouched here because it is presumably what
          # component_spec_digest was computed from -- confirm before editing.
          - |
            def _make_parent_dirs_and_return_path(file_path: str):
                import os
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                return file_path

            def train_model(train_data_input_path, validate_data_input_path, model_output_path):
                import numpy as np
                import pandas as pd
                from keras.models import Sequential
                from keras.layers import Dense, Dropout, BatchNormalization, Activation
                from sklearn.model_selection import train_test_split
                from sklearn.preprocessing import StandardScaler
                from sklearn.utils import class_weight
                import tf2onnx
                import onnx
                import pickle
                from pathlib import Path

                # Load the CSV data which we will use to train the model.
                # It contains the following fields:
                #   distancefromhome - The distance from home where the transaction happened.
                #   distancefromlast_transaction - The distance from last transaction happened.
                #   ratiotomedianpurchaseprice - Ratio of purchased price compared to median purchase price.
                #   repeat_retailer - If it's from a retailer that already has been purchased from before.
                #   used_chip - If the (credit card) chip was used.
                #   usedpinnumber - If the PIN number was used.
                #   online_order - If it was an online order.
                #   fraud - If the transaction is fraudulent.

                feature_indexes = [
                    1,  # distance_from_last_transaction
                    2,  # ratio_to_median_purchase_price
                    4,  # used_chip
                    5,  # used_pin_number
                    6,  # online_order
                ]

                label_indexes = [
                    7  # fraud
                ]

                X_train = pd.read_csv(train_data_input_path)
                y_train = X_train.iloc[:, label_indexes]
                X_train = X_train.iloc[:, feature_indexes]

                X_val = pd.read_csv(validate_data_input_path)
                y_val = X_val.iloc[:, label_indexes]
                X_val = X_val.iloc[:, feature_indexes]

                # Scale the data to remove mean and have unit variance. The data will be between -1 and 1, which makes it a lot easier for the model to learn than random (and potentially large) values.
                # It is important to only fit the scaler to the training data, otherwise you are leaking information about the global distribution of variables (which is influenced by the test set) into the training set.

                scaler = StandardScaler()

                X_train = scaler.fit_transform(X_train.values)

                Path("artifact").mkdir(parents=True, exist_ok=True)
                with open("artifact/scaler.pkl", "wb") as handle:
                    pickle.dump(scaler, handle)

                # Since the dataset is unbalanced (it has many more non-fraud transactions than fraudulent ones), set a class weight to weight the few fraudulent transactions higher than the many non-fraud transactions.
                class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train.values.ravel())
                class_weights = {i: class_weights[i] for i in range(len(class_weights))}

                # Build the model, the model we build here is a simple fully connected deep neural network, containing 3 hidden layers and one output layer.

                model = Sequential()
                model.add(Dense(32, activation='relu', input_dim=len(feature_indexes)))
                model.add(Dropout(0.2))
                model.add(Dense(32))
                model.add(BatchNormalization())
                model.add(Activation('relu'))
                model.add(Dropout(0.2))
                model.add(Dense(32))
                model.add(BatchNormalization())
                model.add(Activation('relu'))
                model.add(Dropout(0.2))
                model.add(Dense(1, activation='sigmoid'))
                model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
                model.summary()

                # Train the model and get performance

                epochs = 2
                history = model.fit(X_train, y_train, epochs=epochs,
                                    validation_data=(scaler.transform(X_val.values), y_val),
                                    verbose=True, class_weight=class_weights)

                # Save the model as ONNX for easy use of ModelMesh
                model_proto, _ = tf2onnx.convert.from_keras(model)
                print(model_output_path)
                onnx.save(model_proto, model_output_path)

            import argparse
            _parser = argparse.ArgumentParser(prog='Train model', description='')
            _parser.add_argument("--train-data-input", dest="train_data_input_path", type=str, required=True, default=argparse.SUPPRESS)
            _parser.add_argument("--validate-data-input", dest="validate_data_input_path", type=str, required=True, default=argparse.SUPPRESS)
            _parser.add_argument("--model-output", dest="model_output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)
            _parsed_args = vars(_parser.parse_args())

            _outputs = train_model(**_parsed_args)
          image: quay.io/modh/runtime-images:runtime-cuda-tensorflow-ubi9-python-3.9-2023a-20230817-b7e647e
          env:
          # Populated from the pod label custom.tekton.dev/originalPipelineRun.
          - name: ORIG_PR_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['custom.tekton.dev/originalPipelineRun']
        # Step output-taskrun-name: writes this TaskRun's name, without a trailing
        # newline, to the taskrun-name result file. upload-model reads it through
        # $(tasks.train-model.results.taskrun-name).
        - name: output-taskrun-name
          image: busybox
          command:
          - sh
          - -ec
          - >-
            echo -n "$(context.taskRun.name)" > "$(results.taskrun-name.path)"
        # Step copy-results-artifacts: publishes the model output as a Tekton
        # result while the running size total stays under 3072 bytes -- the file
        # content for files without non-printable characters, the archive listing
        # for directories. onError: continue keeps a failure here from failing
        # the task.
        - name: copy-results-artifacts
          image: busybox
          command:
          - sh
          - -ec
          - |
            set -exo pipefail
            TOTAL_SIZE=0
            # copy_artifact SRC RESULT_FILE
            copy_artifact() {
            # SUFFIX is a global shell variable: clear it on every call so a
            # directory artifact cannot leave ".tar.gz" set for a later call.
            SUFFIX=""
            if [ -d "$1" ]; then
              tar -czvf "$1".tar.gz "$1"
              SUFFIX=".tar.gz"
            fi
            ARTIFACT_SIZE=`wc -c "$1"${SUFFIX} | awk '{print $1}'`
            # Shell arithmetic instead of expr: expr exits 1 when the sum is 0,
            # which would abort the script under `set -e` for an empty artifact.
            TOTAL_SIZE=$((TOTAL_SIZE + ARTIFACT_SIZE))
            touch "$2"
            # POSIX test rather than the bash-only [[ ]] form.
            if [ "$TOTAL_SIZE" -lt 3072 ]; then
              if [ -d "$1" ]; then
                tar -tzf "$1".tar.gz > "$2"
              elif ! awk "/[^[:print:]]/{f=1} END{exit !f}" "$1"; then
                cp "$1" "$2"
              fi
            fi
            }
            copy_artifact "$(workspaces.train-model.path)/artifacts/$ORIG_PR_NAME/$(context.taskRun.name)/model_output" "$(results.model-output.path)"
          onError: continue
          env:
          # Expanded by the shell in the copy_artifact call above; populated from
          # the pod label custom.tekton.dev/originalPipelineRun.
          - name: ORIG_PR_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['custom.tekton.dev/originalPipelineRun']
        # Parameter declared for the task spec: the get-data TaskRun name that the
        # main step uses to locate its input files in the shared workspace.
        params:
        - name: get-data-trname
        # Results written by the steps of this task; both are plain strings.
        results:
        - name: model-output
          type: string
          description: /tmp/outputs/model_output/data
        - name: taskrun-name
          type: string
        metadata:
          labels:
            pipelines.kubeflow.org/cache_enabled: "true"
          annotations:
            pipelines.kubeflow.org/component_spec_digest: '{"name": "Train model",
              "outputs": [{"name": "model_output"}], "version": "Train model@sha256=3f449c19f1645e6474d91f06763d265c5a7654ed6fb47d7e3cb7952fda6886fe"}'
        # Workspace name the steps refer to as $(workspaces.train-model.path).
        workspaces:
        - name: train-model
      # Binds the task workspace to the pipeline-level workspace.
      workspaces:
      - name: train-model
        workspace: train-upload-stock-kfp
      # Each predecessor is listed once; a repeated entry adds no ordering
      # constraint.
      runAfter:
      - get-data
    # Task upload-model: installs boto3 and botocore, then runs an inline Python
    # program that uploads the model file produced by train-model to an S3
    # bucket.
    - name: upload-model
      params:
      # Name of the train-model TaskRun; used below to build the input path.
      - name: train-model-trname
        value: $(tasks.train-model.results.taskrun-name)
      taskSpec:
        steps:
        - name: main
          # Single option/value pair parsed by the program's argparse section.
          # NOTE(review): the path contains the literal token $ORIG_PR_NAME;
          # nothing visible here expands it inside args -- confirm where it is
          # resolved.
          args:
          - --input-model
          - $(workspaces.upload-model.path)/artifacts/$ORIG_PR_NAME/$(params.train-model-trname)/model_output
          # Two nested shell wrappers. The outer `sh -c` attempts a pip install,
          # retries it with --user if that fails, and on success runs "$0" "$@",
          # which is the inner `sh -ec` wrapper. The inner wrapper writes the
          # Python source (its $0) to a temp file and runs it with the args.
          command:
          - sh
          - -c
          - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
            'boto3' 'botocore' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install
            --quiet --no-warn-script-location 'boto3' 'botocore' --user) && "$0" "$@"
          - sh
          - -ec
          - |
            program_path=$(mktemp)
            printf "%s" "$0" > "$program_path"
            python3 -u "$program_path" "$@"
          # Python program: upload_model() reads the connection settings and the
          # object key from environment variables with os.environ.get (so a
          # missing variable yields None rather than an error at that point) and
          # uploads the input file with Bucket.upload_file.
          - |
            def upload_model(input_model_path):
                import os
                import boto3
                import botocore

                aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
                aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
                endpoint_url = os.environ.get('AWS_S3_ENDPOINT')
                region_name = os.environ.get('AWS_DEFAULT_REGION')
                bucket_name = os.environ.get('AWS_S3_BUCKET')

                s3_key = os.environ.get("S3_KEY")

                session = boto3.session.Session(aws_access_key_id=aws_access_key_id,
                                                aws_secret_access_key=aws_secret_access_key)

                s3_resource = session.resource(
                    's3',
                    config=botocore.client.Config(signature_version='s3v4'),
                    endpoint_url=endpoint_url,
                    region_name=region_name)

                bucket = s3_resource.Bucket(bucket_name)

                print(f"Uploading {s3_key}")
                bucket.upload_file(input_model_path, s3_key)

            import argparse
            _parser = argparse.ArgumentParser(prog='Upload model', description='')
            _parser.add_argument("--input-model", dest="input_model_path", type=str, required=True, default=argparse.SUPPRESS)
            _parsed_args = vars(_parser.parse_args())

            _outputs = upload_model(**_parsed_args)
          env:
          # Object key the program uploads the model to.
          - name: S3_KEY
            value: models/fraud/1/model.onnx
          # Populated from the pod label custom.tekton.dev/originalPipelineRun.
          - name: ORIG_PR_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['custom.tekton.dev/originalPipelineRun']
          # Imports every key of this Secret as an environment variable;
          # presumably the source of the AWS_* variables the program reads --
          # verify against the Secret's contents.
          envFrom:
          - secretRef:
              name: aws-connection-my-storage
          image: quay.io/modh/runtime-images:runtime-cuda-tensorflow-ubi9-python-3.9-2023a-20230817-b7e647e
        # Parameter declared for the task spec: the train-model TaskRun name that
        # the main step uses to locate the model file in the shared workspace.
        params:
        - name: train-model-trname
        metadata:
          annotations:
            pipelines.kubeflow.org/component_spec_digest: >-
              {"name": "Upload model",
              "outputs": [],
              "version": "Upload model@sha256=9c9d2bc5a1c622ba9879077eb0f2f7d0a5d404bd655b68494bdcd2145d358421"}
          labels:
            pipelines.kubeflow.org/cache_enabled: 'true'
        # Workspace name the steps refer to as $(workspaces.upload-model.path).
        workspaces:
        - name: upload-model
      # Runs only after train-model has finished.
      runAfter:
      - train-model
      # Binds the task workspace to the pipeline-level workspace.
      workspaces:
      - name: upload-model
        workspace: train-upload-stock-kfp
    # Workspace declared by the inline pipeline; all three tasks bind to it.
    workspaces:
    - name: train-upload-stock-kfp
  # Run-level binding: the workspace is backed by a volume claim created from
  # this template (2Gi, ReadWriteOnce).
  workspaces:
  - name: train-upload-stock-kfp
    volumeClaimTemplate:
      spec:
        accessModes: [ReadWriteOnce]
        resources:
          requests:
            storage: 2Gi
        storageClassName: ocs-external-storagecluster-ceph-rbd