Skip to content

Commit

Permalink
Merge pull request #59 from aisingapore/dev
Browse files Browse the repository at this point in the history
0.5.0 release part 2
  • Loading branch information
Syakyr authored Feb 19, 2025
2 parents f88022b + 2640b4a commit cbe98cd
Show file tree
Hide file tree
Showing 12 changed files with 209 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,20 @@
+Coder workspace.
+
+[prob]: ../setting-up/02-preface.md#guides-problem-statement
@@ -12,3 +16 @@
@@ -6,0 +10,6 @@
+!!! info "Volume Mounts"
+
+ We will be mounting the entire repository as a volume in the docker
+ container. Hence the following commands can be executed locally in your
+ repository to download the data and it will be mounted into the container.
+
@@ -12,3 +22,2 @@
- echo "Test1" > data1.txt
- echo "Test2" > data2.txt
- echo "Test3" > data3.txt
+ wget https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv
@@ -19,6 +21,4 @@
+ cd ../..
@@ -19,6 +28,4 @@
- ```powershell
- New-Item -ItemType Directory -Path .\data\raw -Force | Out-Null
- Set-Location -Path .\data\raw
Expand All @@ -28,6 +36,6 @@
+ New-Item -ItemType Directory -Force -Path "data/raw" | Out-Null
+ Set-Location -Path "data/raw"
+ Invoke-WebRequest -Uri "https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv" -OutFile "ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv"
@@ -28 +28 @@
@@ -28 +35 @@
-data and eventually 'training' a dummy model.
+data and eventually training a resale price prediction model.
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
--- {{cookiecutter.repo_name}}/aisg-context/guide-site/docs/docker/06b-job-orchestration.md
+++ {{cookiecutter.repo_name}}/problem-templates/hdb/aisg-context/guide-site/docs/docker/06b-job-orchestration.md
@@ -153,2 +153,7 @@
@@ -39,0 +39,3 @@
+There are other configurables in the `conf/process_data.yaml` which are used
+in data preparation scripts found in `src/{{cookiecutter.src_package_name}}/data_prep`.
+
@@ -153,2 +156,7 @@
-dummy_param1: 1.3
-dummy_param2: 0.8
+artifact_dir_path: "./models"
Expand All @@ -10,26 +14,28 @@
+gamma: 1
+max_depth: 5
+seed: 1111
@@ -341,2 +352,2 @@
@@ -341,2 +349,2 @@
- direction: ["minimize", "maximize"]
- study_name: "image-classification"
+ direction: ["minimize"]
+ study_name: "hdb-resale-process"
@@ -347,2 +358,4 @@
@@ -347,2 +355,4 @@
- dummy_param1: range(0.9,1.7,step=0.1)
- dummy_param2: choice(0.7,0.8,0.9)
+ n_estimators: range(50, 200, step=10)
+ lr: tag(log, interval(0.1, 0.6))
+ gamma: choice(0,0.1,0.2,0.3,0.4,0.5)
+ max_depth: range(2,20,step=1)
@@ -375 +388 @@
@@ -375 +375 @@
- return args["dummy_param1"], args["dummy_param2"]
+ return test_rmse ## or any other metrics
@@ -382 +395 @@
@@ -382 +392 @@
- direction: ["minimize", "maximize"]
+ direction: ["minimize"] ## or ["maximise"], if you're looking to maximise the test_rmse value
@@ -386,2 +399,2 @@
@@ -386,3 +396,3 @@
-In the training script the returned variables are to contain values
-that we seek to optimise for. In this case, we seek to minimise the
-loss and maximise the accuracy. The `hydra.sweeper.direction` field in
+In the training script the returned variables are to contain **values
+that we seek to optimise for**. In this case, we seek to minimise the
+root mean square error. The `hydra.sweeper.direction` field in
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,13 @@
+machine to execute some of the steps of the end-to-end machine learning
+workflow. Hence, we can begin by creating a virtual environment that
+will contain all the dependencies required for this guide.
@@ -14,0 +18,7 @@
+If you have an Nvidia GPU, you can make use of the YAML configuration
+that makes use of that GPU:
@@ -14,0 +17,5 @@
+In this exercise, the template includes training with an XGBoost Regressor model.
+The XGBoost binary packages support the GPU algorithm (device=cuda:0) on
+machines with NVIDIA GPUs, hence no extra packages are needed (unlike Pytorch
+in Exercise 1). We do need to create a GPU specific conda environment.
+
+```bash
+conda env create -f {{cookiecutter.repo_name}}-conda-env-gpu.yaml
+```
+
@@ -16,4 +26,7 @@
@@ -16,4 +24,7 @@
- Conda environment configured using the YAML file does not take into
- account whether you need extra requirements to use your GPU for
- training/inference. Check the instructions on your ML/AI framework
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
+Coder workspace.
+
+[prob]: ../setting-up/02-preface.md#guides-problem-statement
@@ -12,3 +16 @@
@@ -12,3 +16,2 @@
- echo "Test1" > data1.txt
- echo "Test2" > data2.txt
- echo "Test3" > data3.txt
+ wget https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv
@@ -19,6 +21,4 @@
+ cd ../..
@@ -19,6 +22,4 @@
- ```powershell
- New-Item -ItemType Directory -Path .\data\raw -Force | Out-Null
- Set-Location -Path .\data\raw
Expand All @@ -28,6 +29,6 @@
+ New-Item -ItemType Directory -Force -Path "data/raw" | Out-Null
+ Set-Location -Path "data/raw"
+ Invoke-WebRequest -Uri "https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv" -OutFile "ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv"
@@ -28 +28 @@
@@ -28 +29 @@
-data and eventually 'training' a dummy model.
+data and eventually training a resale price prediction model.
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
--- {{cookiecutter.repo_name}}/aisg-context/guide-site/docs/local/06a-job-orchestration.md
+++ {{cookiecutter.repo_name}}/problem-templates/hdb/aisg-context/guide-site/docs/local/06a-job-orchestration.md
@@ -49,4 +49,0 @@
@@ -43,0 +43,3 @@
+There are other configurables in the `conf/process_data.yaml` which are used
+in data preparation scripts found in `src/{{cookiecutter.src_package_name}}/data_prep`.
+
@@ -49,4 +52,0 @@
-# Add no_cuda=False at the end to enable GPU use.
-# Make sure you have installed CUDA/RoCM before using.
-# Check that LD_LIBRARY_PATH has been set.
-# Also set HIP_VISIBLE_DEVICES=0 if RoCM is used.
@@ -96,2 +96,7 @@
@@ -96,2 +95,7 @@
-dummy_param1: 1.3
-dummy_param2: 0.8
+artifact_dir_path: "./models"
Expand All @@ -15,21 +19,24 @@
+gamma: 1
+max_depth: 5
+seed: 1111
@@ -166,2 +177,2 @@
@@ -166,2 +170,2 @@
- direction: ["minimize", "maximize"]
- study_name: "image-classification"
+ direction: ["minimize"]
+ study_name: "hdb-resale-process"
@@ -172,2 +183,4 @@
@@ -172,2 +176,4 @@
- dummy_param1: range(0.9,1.7,step=0.1)
- dummy_param2: choice(0.7,0.8,0.9)
+ n_estimators: range(50, 200, step=10)
+ lr: tag(log, interval(0.1, 0.6))
+ gamma: choice(0,0.1,0.2,0.3,0.4,0.5)
+ max_depth: range(2,20,step=1)
@@ -200 +213 @@
@@ -200 +206 @@
- return args["dummy_param1"], args["dummy_param2"]
+ return test_rmse ## or any other metrics
@@ -207 +220 @@
@@ -207 +213 @@
- direction: ["minimize", "maximize"]
+ direction: ["minimize"] ## or ["maximise"], if you're looking to maximise the test_rmse value
@@ -213 +219 @@
-loss and maximise the accuracy. The `hydra.sweeper.direction` field in
+root mean square error. The `hydra.sweeper.direction` field in
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,22 @@
+[prob]: ../setting-up/02-preface.md#guides-problem-statement
+
+=== "Coder Workspace Terminal"
@@ -11,4 +15,2 @@
@@ -11,4 +16,3 @@
- mkdir -p /<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/data/raw && cd "$_"
- echo "Test1" > data1.txt
- echo "Test2" > data2.txt
- echo "Test3" > data3.txt
+ mkdir -p ./data/raw && cd "$_"
+ mkdir -p /<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/data/raw && cd "$_"
+ wget https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv
@@ -16,0 +19,7 @@
+ cd ../..
@@ -16,0 +20,7 @@
+!!! info
+ The sample data for this guide's problem statement is made
+ accessible to the public. Hence any team or individual can download
+ it. It is highly likely that your project's data is not publicly
+ accessible and neither should it be, especially if it is a 100E
+ project.
+
@@ -18 +28 @@
@@ -18 +29 @@
-data and eventually 'training' a dummy model.
+data and eventually training a resale price prediction model.
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
--- {{cookiecutter.repo_name}}/aisg-context/guide-site/docs/runai/06c-job-orchestration.md
+++ {{cookiecutter.repo_name}}/problem-templates/hdb/aisg-context/guide-site/docs/runai/06c-job-orchestration.md
@@ -54 +54 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -73 +73 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -144,2 +144,7 @@
@@ -49,0 +49,3 @@
+There are other configurables in the `conf/process_data.yaml` which are used
+in data preparation scripts found in `src/{{cookiecutter.src_package_name}}/data_prep`.
+
@@ -144,2 +147,7 @@
-dummy_param1: 1.3
-dummy_param2: 0.8
+artifact_dir_path: "./models"
Expand All @@ -16,44 +14,38 @@
+gamma: 1
+max_depth: 5
+seed: 1111
@@ -157 +168 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -176 +187 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -191,2 +202,5 @@
@@ -191,2 +199,5 @@
- artifact_dir_path=/<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/models \
- mlflow_tracking_uri=<MLFLOW_TRACKING_URI>"
+ setup_mlflow=true \
+ mlflow_tracking_uri=<MLFLOW_TRACKING_URI> \
+ mlflow_exp_name=<NAME_OF_DEFAULT_MLFLOW_EXPERIMENT> \
+ model_checkpoint_dir_path=/<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/{{cookiecutter.repo_name}}/models \
+ epochs=3"
@@ -259,2 +273,2 @@
@@ -259,2 +270,2 @@
- direction: ["minimize", "maximize"]
- study_name: "image-classification"
+ direction: ["minimize"]
+ study_name: "hdb-resale-process"
@@ -265,2 +279,4 @@
@@ -265,2 +276,4 @@
- dummy_param1: range(0.9,1.7,step=0.1)
- dummy_param2: choice(0.7,0.8,0.9)
+ n_estimators: range(50, 200, step=10)
+ lr: tag(log, interval(0.1, 0.6))
+ gamma: choice(0,0.1,0.2,0.3,0.4,0.5)
+ max_depth: range(2,20,step=1)
@@ -293 +309 @@
@@ -293 +306 @@
- return args["dummy_param1"], args["dummy_param2"]
+ return test_rmse ## or any other metrics
@@ -300 +316 @@
@@ -300 +313 @@
- direction: ["minimize", "maximize"]
+ direction: ["minimize"] ## or ["maximise"], if you're looking to maximise the test_rmse value
@@ -323 +339 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -335,0 +352 @@
@@ -306 +319 @@
-loss and maximise the accuracy. The `hydra.sweeper.direction` field in
+root mean square error. The `hydra.sweeper.direction` field in
@@ -335,0 +348 @@
+ -e OMP_NUM_THREADS=2 \
@@ -338,2 +355,5 @@
@@ -338,2 +352,5 @@
- artifact_dir_path=/<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/models \
- mlflow_tracking_uri=<MLFLOW_TRACKING_URI>"
+ setup_mlflow=true \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
--- {{cookiecutter.repo_name}}/conf/process_data.yaml
+++ {{cookiecutter.repo_name}}/problem-templates/hdb/conf/process_data.yaml
@@ -2,0 +3,39 @@
@@ -2,0 +3,40 @@
+
+test_size: 0.2
+seed: 1111
+
+num_cols: [
+ "floor_area_sqm",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
FROM python:3.12-slim AS compile-image

ARG DEBIAN_FRONTEND="noninteractive"

ARG NON_ROOT_USER="aisg"
ARG NON_ROOT_UID="2222"
ARG NON_ROOT_GID="2222"
ARG HOME_DIR="/home/${NON_ROOT_USER}"

ARG REPO_DIR="."

# Non-root user that owns the copied files and the pip user site (~/.local).
RUN useradd -l -m -s /bin/bash -u ${NON_ROOT_UID} ${NON_ROOT_USER}

# Build-time tooling only; this stage is discarded once ~/.local is copied
# into the runtime stage. Use apt-get (not apt) for a stable CLI, skip
# recommended packages, and clean the apt lists in the same layer so the
# cache never persists in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git && \
    rm -rf /var/lib/apt/lists/*

ENV PYTHONIOENCODING=utf8
ENV LANG="C.UTF-8"
ENV LC_ALL="C.UTF-8"
# Put user-site console scripts on PATH for the pip install below.
ENV PATH="${HOME_DIR}/.local/bin:${PATH}"

USER ${NON_ROOT_USER}
WORKDIR ${HOME_DIR}

# Copy only the dependency manifest (not the whole repository) so this layer
# and the install below stay cached when application source changes.
COPY --chown=${NON_ROOT_USER}:${NON_ROOT_GID} ${REPO_DIR}/requirements.txt {{cookiecutter.repo_name}}/requirements.txt

# Running as a non-root user, pip installs into ~/.local (user site), which
# the runtime stage copies wholesale. --no-cache-dir keeps the pip download
# cache out of the layer.
RUN pip install --no-cache-dir --prefer-binary -r {{cookiecutter.repo_name}}/requirements.txt

FROM python:3.12-slim

ARG DEBIAN_FRONTEND="noninteractive"

ARG NON_ROOT_USER="aisg"
ARG NON_ROOT_UID="2222"
ARG NON_ROOT_GID="2222"
ARG HOME_DIR="/home/${NON_ROOT_USER}"

ARG REPO_DIR="."

# Recreate the same non-root user (matching UID) as in the compile stage so
# the copied ~/.local and repository files keep a valid owner.
RUN useradd -l -m -s /bin/bash -u ${NON_ROOT_UID} ${NON_ROOT_USER}

# Runtime utilities only — no build-essential here, keeping the final image
# small. apt-get for a stable CLI; skip recommends and clean apt lists in
# the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        git && \
    rm -rf /var/lib/apt/lists/*

ENV PYTHONIOENCODING=utf8
ENV LANG="C.UTF-8"
ENV LC_ALL="C.UTF-8"
# Expose the user-site console scripts (copied in below) on PATH.
ENV PATH="${HOME_DIR}/.local/bin:${PATH}"

USER ${NON_ROOT_USER}
WORKDIR ${HOME_DIR}

# Bring in the Python packages installed in the compile stage.
COPY --from=compile-image ${HOME_DIR}/.local ${HOME_DIR}/.local

# Application source, owned by the non-root user.
COPY --chown=${NON_ROOT_USER}:${NON_ROOT_GID} ${REPO_DIR} {{cookiecutter.repo_name}}

WORKDIR ${HOME_DIR}/{{cookiecutter.repo_name}}
Loading

0 comments on commit cbe98cd

Please sign in to comment.