Skip to content

Commit

Permalink
Merge pull request #59 from aisingapore/dev
Browse files Browse the repository at this point in the history
0.5.0 release part 2
  • Loading branch information
Syakyr authored Feb 19, 2025
2 parents f88022b + 2640b4a commit cbe98cd
Show file tree
Hide file tree
Showing 12 changed files with 209 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,20 @@
+Coder workspace.
+
+[prob]: ../setting-up/02-preface.md#guides-problem-statement
@@ -12,3 +16 @@
@@ -6,0 +10,6 @@
+!!! info "Volume Mounts"
+
+ We will be mounting the entire repository as a volume in the docker
+ container. Hence the following commands can be executed locally in your
+ repository to download the data and it will be mounted into the container.
+
@@ -12,3 +22,2 @@
- echo "Test1" > data1.txt
- echo "Test2" > data2.txt
- echo "Test3" > data3.txt
+ wget https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv
@@ -19,6 +21,4 @@
+ cd ../..
@@ -19,6 +28,4 @@
- ```powershell
- New-Item -ItemType Directory -Path .\data\raw -Force | Out-Null
- Set-Location -Path .\data\raw
Expand All @@ -28,6 +36,6 @@
+ New-Item -ItemType Directory -Force -Path "data/raw" | Out-Null
+ Set-Location -Path "data/raw"
+ Invoke-WebRequest -Uri "https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv" -OutFile "ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv"
@@ -28 +28 @@
@@ -28 +35 @@
-data and eventually 'training' a dummy model.
+data and eventually training a resale price prediction model.
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
--- {{cookiecutter.repo_name}}/aisg-context/guide-site/docs/docker/06b-job-orchestration.md
+++ {{cookiecutter.repo_name}}/problem-templates/hdb/aisg-context/guide-site/docs/docker/06b-job-orchestration.md
@@ -153,2 +153,7 @@
@@ -39,0 +39,3 @@
+There are other configurables in the `conf/process_data.yaml` which are used
+in data preparation scripts found in `src/{{cookiecutter.src_package_name}}/data_prep`.
+
@@ -153,2 +156,7 @@
-dummy_param1: 1.3
-dummy_param2: 0.8
+artifact_dir_path: "./models"
Expand All @@ -10,26 +14,28 @@
+gamma: 1
+max_depth: 5
+seed: 1111
@@ -341,2 +352,2 @@
@@ -341,2 +349,2 @@
- direction: ["minimize", "maximize"]
- study_name: "image-classification"
+ direction: ["minimize"]
+ study_name: "hdb-resale-process"
@@ -347,2 +358,4 @@
@@ -347,2 +355,4 @@
- dummy_param1: range(0.9,1.7,step=0.1)
- dummy_param2: choice(0.7,0.8,0.9)
+ n_estimators: range(50, 200, step=10)
+ lr: tag(log, interval(0.1, 0.6))
+ gamma: choice(0,0.1,0.2,0.3,0.4,0.5)
+ max_depth: range(2,20,step=1)
@@ -375 +388 @@
@@ -375 +375 @@
- return args["dummy_param1"], args["dummy_param2"]
+ return test_rmse ## or any other metrics
@@ -382 +395 @@
@@ -382 +392 @@
- direction: ["minimize", "maximize"]
+ direction: ["minimize"] ## or ["maximise"], if you're looking to maximise the test_rmse value
@@ -386,2 +399,2 @@
@@ -386,3 +396,3 @@
-In the training script the returned variables are to contain values
-that we seek to optimise for. In this case, we seek to minimise the
-loss and maximise the accuracy. The `hydra.sweeper.direction` field in
+In the training script the returned variables are to contain **values
+that we seek to optimise for**. In this case, we seek to minimise the
+root mean square error. The `hydra.sweeper.direction` field in
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,13 @@
+machine to execute some of the steps of the end-to-end machine learning
+workflow. Hence, we can begin by creating a virtual environment that
+will contain all the dependencies required for this guide.
@@ -14,0 +18,7 @@
+If you have an Nvidia GPU, you can make use of the YAML configuration
+that makes use of that GPU:
@@ -14,0 +17,5 @@
+In this exercise, the template includes training with an XGBoost Regressor model.
+The XGBoost binary packages support the GPU algorithm (device=cuda:0) on
+machines with NVIDIA GPUs, hence no extra packages are needed (unlike Pytorch
+in Exercise 1). We do need to create a GPU specific conda environment.
+
+```bash
+conda env create -f {{cookiecutter.repo_name}}-conda-env-gpu.yaml
+```
+
@@ -16,4 +26,7 @@
@@ -16,4 +24,7 @@
- Conda environment configured using the YAML file does not take into
- account whether you need extra requirements to use your GPU for
- training/inference. Check the instructions on your ML/AI framework
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
+Coder workspace.
+
+[prob]: ../setting-up/02-preface.md#guides-problem-statement
@@ -12,3 +16 @@
@@ -12,3 +16,2 @@
- echo "Test1" > data1.txt
- echo "Test2" > data2.txt
- echo "Test3" > data3.txt
+ wget https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv
@@ -19,6 +21,4 @@
+ cd ../..
@@ -19,6 +22,4 @@
- ```powershell
- New-Item -ItemType Directory -Path .\data\raw -Force | Out-Null
- Set-Location -Path .\data\raw
Expand All @@ -28,6 +29,6 @@
+ New-Item -ItemType Directory -Force -Path "data/raw" | Out-Null
+ Set-Location -Path "data/raw"
+ Invoke-WebRequest -Uri "https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv" -OutFile "ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv"
@@ -28 +28 @@
@@ -28 +29 @@
-data and eventually 'training' a dummy model.
+data and eventually training a resale price prediction model.
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
--- {{cookiecutter.repo_name}}/aisg-context/guide-site/docs/local/06a-job-orchestration.md
+++ {{cookiecutter.repo_name}}/problem-templates/hdb/aisg-context/guide-site/docs/local/06a-job-orchestration.md
@@ -49,4 +49,0 @@
@@ -43,0 +43,3 @@
+There are other configurables in the `conf/process_data.yaml` which are used
+in data preparation scripts found in `src/{{cookiecutter.src_package_name}}/data_prep`.
+
@@ -49,4 +52,0 @@
-# Add no_cuda=False at the end to enable GPU use.
-# Make sure you have installed CUDA/RoCM before using.
-# Check that LD_LIBRARY_PATH has been set.
-# Also set HIP_VISIBLE_DEVICES=0 if RoCM is used.
@@ -96,2 +96,7 @@
@@ -96,2 +95,7 @@
-dummy_param1: 1.3
-dummy_param2: 0.8
+artifact_dir_path: "./models"
Expand All @@ -15,21 +19,24 @@
+gamma: 1
+max_depth: 5
+seed: 1111
@@ -166,2 +177,2 @@
@@ -166,2 +170,2 @@
- direction: ["minimize", "maximize"]
- study_name: "image-classification"
+ direction: ["minimize"]
+ study_name: "hdb-resale-process"
@@ -172,2 +183,4 @@
@@ -172,2 +176,4 @@
- dummy_param1: range(0.9,1.7,step=0.1)
- dummy_param2: choice(0.7,0.8,0.9)
+ n_estimators: range(50, 200, step=10)
+ lr: tag(log, interval(0.1, 0.6))
+ gamma: choice(0,0.1,0.2,0.3,0.4,0.5)
+ max_depth: range(2,20,step=1)
@@ -200 +213 @@
@@ -200 +206 @@
- return args["dummy_param1"], args["dummy_param2"]
+ return test_rmse ## or any other metrics
@@ -207 +220 @@
@@ -207 +213 @@
- direction: ["minimize", "maximize"]
+ direction: ["minimize"] ## or ["maximise"], if you're looking to maximise the test_rmse value
@@ -213 +219 @@
-loss and maximise the accuracy. The `hydra.sweeper.direction` field in
+root mean square error. The `hydra.sweeper.direction` field in
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,22 @@
+[prob]: ../setting-up/02-preface.md#guides-problem-statement
+
+=== "Coder Workspace Terminal"
@@ -11,4 +15,2 @@
@@ -11,4 +16,3 @@
- mkdir -p /<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/data/raw && cd "$_"
- echo "Test1" > data1.txt
- echo "Test2" > data2.txt
- echo "Test3" > data3.txt
+ mkdir -p ./data/raw && cd "$_"
+ mkdir -p /<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/data/raw && cd "$_"
+ wget https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv
@@ -16,0 +19,7 @@
+ cd ../..
@@ -16,0 +20,7 @@
+!!! info
+ The sample data for this guide's problem statement is made
+ accessible to the public. Hence any team or individual can download
+ it. It is highly likely that your project's data is not publicly
+ accessible and neither should it be, especially if it is a 100E
+ project.
+
@@ -18 +28 @@
@@ -18 +29 @@
-data and eventually 'training' a dummy model.
+data and eventually training a resale price prediction model.
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
--- {{cookiecutter.repo_name}}/aisg-context/guide-site/docs/runai/06c-job-orchestration.md
+++ {{cookiecutter.repo_name}}/problem-templates/hdb/aisg-context/guide-site/docs/runai/06c-job-orchestration.md
@@ -54 +54 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -73 +73 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -144,2 +144,7 @@
@@ -49,0 +49,3 @@
+There are other configurables in the `conf/process_data.yaml` which are used
+in data preparation scripts found in `src/{{cookiecutter.src_package_name}}/data_prep`.
+
@@ -144,2 +147,7 @@
-dummy_param1: 1.3
-dummy_param2: 0.8
+artifact_dir_path: "./models"
Expand All @@ -16,44 +14,38 @@
+gamma: 1
+max_depth: 5
+seed: 1111
@@ -157 +168 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -176 +187 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -191,2 +202,5 @@
@@ -191,2 +199,5 @@
- artifact_dir_path=/<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/models \
- mlflow_tracking_uri=<MLFLOW_TRACKING_URI>"
+ setup_mlflow=true \
+ mlflow_tracking_uri=<MLFLOW_TRACKING_URI> \
+ mlflow_exp_name=<NAME_OF_DEFAULT_MLFLOW_EXPERIMENT> \
+ model_checkpoint_dir_path=/<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/{{cookiecutter.repo_name}}/models \
+ epochs=3"
@@ -259,2 +273,2 @@
@@ -259,2 +270,2 @@
- direction: ["minimize", "maximize"]
- study_name: "image-classification"
+ direction: ["minimize"]
+ study_name: "hdb-resale-process"
@@ -265,2 +279,4 @@
@@ -265,2 +276,4 @@
- dummy_param1: range(0.9,1.7,step=0.1)
- dummy_param2: choice(0.7,0.8,0.9)
+ n_estimators: range(50, 200, step=10)
+ lr: tag(log, interval(0.1, 0.6))
+ gamma: choice(0,0.1,0.2,0.3,0.4,0.5)
+ max_depth: range(2,20,step=1)
@@ -293 +309 @@
@@ -293 +306 @@
- return args["dummy_param1"], args["dummy_param2"]
+ return test_rmse ## or any other metrics
@@ -300 +316 @@
@@ -300 +313 @@
- direction: ["minimize", "maximize"]
+ direction: ["minimize"] ## or ["maximise"], if you're looking to maximise the test_rmse value
@@ -323 +339 @@
-=== "VSCode Server Terminal"
+=== "Coder Workspace Terminal"
@@ -335,0 +352 @@
@@ -306 +319 @@
-loss and maximise the accuracy. The `hydra.sweeper.direction` field in
+root mean square error. The `hydra.sweeper.direction` field in
@@ -335,0 +348 @@
+ -e OMP_NUM_THREADS=2 \
@@ -338,2 +355,5 @@
@@ -338,2 +352,5 @@
- artifact_dir_path=/<NAME_OF_DATA_SOURCE>/workspaces/<YOUR_HYPHENATED_NAME>/models \
- mlflow_tracking_uri=<MLFLOW_TRACKING_URI>"
+ setup_mlflow=true \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
--- {{cookiecutter.repo_name}}/conf/process_data.yaml
+++ {{cookiecutter.repo_name}}/problem-templates/hdb/conf/process_data.yaml
@@ -2,0 +3,39 @@
@@ -2,0 +3,40 @@
+
+test_size: 0.2
+seed: 1111
+
+num_cols: [
+ "floor_area_sqm",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
FROM python:3.12-slim AS compile-image

ARG DEBIAN_FRONTEND="noninteractive"

ARG NON_ROOT_USER="aisg"
ARG NON_ROOT_UID="2222"
ARG NON_ROOT_GID="2222"
ARG HOME_DIR="/home/${NON_ROOT_USER}"

ARG REPO_DIR="."

# Non-root user that owns the copied files and the pip user site (~/.local).
RUN useradd -l -m -s /bin/bash -u ${NON_ROOT_UID} ${NON_ROOT_USER}

# Build-time tooling only; this stage is discarded once ~/.local is copied
# into the runtime stage. Use apt-get (not apt) for a stable CLI, skip
# recommended packages, and clean the apt lists in the same layer so the
# cache never persists in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git && \
    rm -rf /var/lib/apt/lists/*

ENV PYTHONIOENCODING=utf8
ENV LANG="C.UTF-8"
ENV LC_ALL="C.UTF-8"
# Put user-site console scripts on PATH for the pip install below.
ENV PATH="${HOME_DIR}/.local/bin:${PATH}"

USER ${NON_ROOT_USER}
WORKDIR ${HOME_DIR}

# Copy only the dependency manifest (not the whole repository) so this layer
# and the install below stay cached when application source changes.
COPY --chown=${NON_ROOT_USER}:${NON_ROOT_GID} ${REPO_DIR}/requirements.txt {{cookiecutter.repo_name}}/requirements.txt

# Running as a non-root user, pip installs into ~/.local (user site), which
# the runtime stage copies wholesale. --no-cache-dir keeps the pip download
# cache out of the layer.
RUN pip install --no-cache-dir --prefer-binary -r {{cookiecutter.repo_name}}/requirements.txt

FROM python:3.12-slim

ARG DEBIAN_FRONTEND="noninteractive"

ARG NON_ROOT_USER="aisg"
ARG NON_ROOT_UID="2222"
ARG NON_ROOT_GID="2222"
ARG HOME_DIR="/home/${NON_ROOT_USER}"

ARG REPO_DIR="."

# Recreate the same non-root user (matching UID) as in the compile stage so
# the copied ~/.local and repository files keep a valid owner.
RUN useradd -l -m -s /bin/bash -u ${NON_ROOT_UID} ${NON_ROOT_USER}

# Runtime utilities only — no build-essential here, keeping the final image
# small. apt-get for a stable CLI; skip recommends and clean apt lists in
# the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        git && \
    rm -rf /var/lib/apt/lists/*

ENV PYTHONIOENCODING=utf8
ENV LANG="C.UTF-8"
ENV LC_ALL="C.UTF-8"
# Expose the user-site console scripts (copied in below) on PATH.
ENV PATH="${HOME_DIR}/.local/bin:${PATH}"

USER ${NON_ROOT_USER}
WORKDIR ${HOME_DIR}

# Bring in the Python packages installed in the compile stage.
COPY --from=compile-image ${HOME_DIR}/.local ${HOME_DIR}/.local

# Application source, owned by the non-root user.
COPY --chown=${NON_ROOT_USER}:${NON_ROOT_GID} ${REPO_DIR} {{cookiecutter.repo_name}}

WORKDIR ${HOME_DIR}/{{cookiecutter.repo_name}}
Loading

0 comments on commit cbe98cd

Please sign in to comment.