From 1df54c478db35f7d50c42e67e36dffc007e43556 Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Tue, 8 Oct 2024 16:44:09 -0400 Subject: [PATCH 01/17] deleted $ for multiline commands --- source/cloud/azure/aks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/cloud/azure/aks.md b/source/cloud/azure/aks.md index 3c2d0732..c74facd3 100644 --- a/source/cloud/azure/aks.md +++ b/source/cloud/azure/aks.md @@ -23,7 +23,7 @@ $ az login Now we can launch a GPU enabled AKS cluster. First launch an AKS cluster. ```console -$ az aks create -g -n rapids \ + az aks create -g -n rapids \ --enable-managed-identity \ --node-count 1 \ --enable-addons monitoring \ @@ -92,7 +92,7 @@ $ az extension add --name aks-preview ````` ```console -$ az aks nodepool add \ + az aks nodepool add \ --resource-group \ --cluster-name rapids \ --name gpunp \ From 9b7088a0830e506b1e73b3d48577f9b6c3ccea77 Mon Sep 17 00:00:00 2001 From: Melody Wang <98235366+melodywang060@users.noreply.github.com> Date: Wed, 9 Oct 2024 11:06:39 -0400 Subject: [PATCH 02/17] Update source/cloud/azure/aks.md Co-authored-by: Jacob Tomlinson --- source/cloud/azure/aks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/cloud/azure/aks.md b/source/cloud/azure/aks.md index c74facd3..ce2eae3d 100644 --- a/source/cloud/azure/aks.md +++ b/source/cloud/azure/aks.md @@ -22,7 +22,7 @@ $ az login Now we can launch a GPU enabled AKS cluster. First launch an AKS cluster. -```console +```bash az aks create -g -n rapids \ --enable-managed-identity \ --node-count 1 \ From f1a8682d9cd3f7cccbd095dd4eb7904dfffdc092 Mon Sep 17 00:00:00 2001 From: Melody Wang <98235366+melodywang060@users.noreply.github.com> Date: Wed, 9 Oct 2024 11:06:47 -0400 Subject: [PATCH 03/17] Update source/cloud/azure/aks.md Co-authored-by: Jacob Tomlinson --- source/cloud/azure/aks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/cloud/azure/aks.md b/source/cloud/azure/aks.md index ce2eae3d..e331b54a 100644 --- a/source/cloud/azure/aks.md +++ b/source/cloud/azure/aks.md @@ -23,7 +23,7 @@ $ az login Now we can launch a GPU enabled AKS cluster. First launch an AKS cluster. ```bash - az aks create -g -n rapids \ +az aks create -g -n rapids \ --enable-managed-identity \ --node-count 1 \ --enable-addons monitoring \ From bcf36bcf3d6676589d87b0bf38d1ffccf3daedc8 Mon Sep 17 00:00:00 2001 From: Melody Wang <98235366+melodywang060@users.noreply.github.com> Date: Wed, 9 Oct 2024 11:06:52 -0400 Subject: [PATCH 04/17] Update source/cloud/azure/aks.md Co-authored-by: Jacob Tomlinson --- source/cloud/azure/aks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/cloud/azure/aks.md b/source/cloud/azure/aks.md index e331b54a..d70c95fa 100644 --- a/source/cloud/azure/aks.md +++ b/source/cloud/azure/aks.md @@ -91,7 +91,7 @@ $ az extension add --name aks-preview ````` -```console +```bash az aks nodepool add \ --resource-group \ --cluster-name rapids \ From f54e67b3d0b84dab38bd52104ad6f3dea8ecc1f1 Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Wed, 9 Oct 2024 14:23:12 -0400 Subject: [PATCH 05/17] fixed multiline command issue --- source/_includes/check-gpu-pod-works.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/_includes/check-gpu-pod-works.md b/source/_includes/check-gpu-pod-works.md index bd5593c7..617a1944 100644 --- a/source/_includes/check-gpu-pod-works.md +++ b/source/_includes/check-gpu-pod-works.md @@ -1,7 +1,7 @@ Let's create a sample pod that uses some GPU compute to make sure that everything is working as expected. -```console -$ cat << EOF | kubectl create -f - +```bash +cat << EOF | kubectl create -f - apiVersion: v1 kind: Pod metadata: From c57b8c3602687ae275eb030c2919070c3263f754 Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Thu, 10 Oct 2024 11:10:49 -0400 Subject: [PATCH 06/17] added more detailed instructions --- source/cloud/azure/azureml.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/cloud/azure/azureml.md b/source/cloud/azure/azureml.md index 5bfc1a99..ffe502ac 100644 --- a/source/cloud/azure/azureml.md +++ b/source/cloud/azure/azureml.md @@ -32,7 +32,7 @@ The compute instance provides an integrated Jupyter notebook service, JupyterLab Sign in to [Azure Machine Learning Studio](https://ml.azure.com/) and navigate to your workspace on the left-side menu. -Select **Compute** > **+ New** > choose a [RAPIDS compatible GPU](https://medium.com/dropout-analytics/which-gpus-work-with-rapids-ai-f562ef29c75f) VM size (e.g., `Standard_NC12s_v3`) +Select **Compute** > **+ New** (Create compute instance) > choose a [RAPIDS compatible GPU](https://medium.com/dropout-analytics/which-gpus-work-with-rapids-ai-f562ef29c75f) VM size (e.g., `Standard_NC12s_v3`) ![Screenshot of create new notebook with a gpu-instance](../../images/azureml-create-notebook-instance.png) From 1205ab9c00a0c5f7f6cf4e2cfede53866ca5714a Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Thu, 10 Oct 2024 12:27:46 -0400 Subject: [PATCH 07/17] added clearer user input sections --- source/examples/rapids-azureml-hpo/notebook.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb index d4bee24c..f2e3ac73 100644 --- a/source/examples/rapids-azureml-hpo/notebook.ipynb +++ b/source/examples/rapids-azureml-hpo/notebook.ipynb @@ -100,9 +100,9 @@ "# Get a handle to the workspace\n", "ml_client = MLClient(\n", " credential=DefaultAzureCredential(),\n", - " subscription_id=\"fc4f4a6b-4041-4b1c-8249-854d68edcf62\",\n", - " resource_group_name=\"rapidsai-deployment\",\n", - " workspace_name=\"rapids-aml-cluster\",\n", + " subscription_id= #FILL IN WITH YOUR AZURE ML CREDENTIALS,\n", + " resource_group_name= #FILL IN WITH YOUR AZURE ML CREDENTIALS,\n", + " workspace_name= #FILL IN WITH YOUR AZURE ML CREDENTIALS,\n", ")\n", "\n", "print(\n", From 33b27db579e3a4a67bea002d470f314db483ada1 Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Thu, 10 Oct 2024 12:56:39 -0400 Subject: [PATCH 08/17] more descripted title --- source/examples/rapids-azureml-hpo/notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb index f2e3ac73..9248a42f 100644 --- a/source/examples/rapids-azureml-hpo/notebook.ipynb +++ b/source/examples/rapids-azureml-hpo/notebook.ipynb @@ -12,7 +12,7 @@ ] }, "source": [ - "# Train and Hyperparameter-Tune with RAPIDS" + "# Train and Hyperparameter-Tune with RAPIDS on AzureML" ] }, { From 203465810835131c90dbcbb148a348b356baf9f2 Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Thu, 10 Oct 2024 13:11:56 -0400 Subject: [PATCH 09/17] fixed linting errors --- source/examples/rapids-azureml-hpo/notebook.ipynb | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb index 9248a42f..4b8ca523 100644 --- a/source/examples/rapids-azureml-hpo/notebook.ipynb +++ b/source/examples/rapids-azureml-hpo/notebook.ipynb @@ -97,12 +97,17 @@ "from azure.ai.ml import MLClient\n", "from azure.identity import DefaultAzureCredential\n", "\n", + "\n", + "subscription_id = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", + "resource_group_name = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", + "workspace_name = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", + "\n", "# Get a handle to the workspace\n", "ml_client = MLClient(\n", " credential=DefaultAzureCredential(),\n", - " subscription_id= #FILL IN WITH YOUR AZURE ML CREDENTIALS,\n", - " resource_group_name= #FILL IN WITH YOUR AZURE ML CREDENTIALS,\n", - " workspace_name= #FILL IN WITH YOUR AZURE ML CREDENTIALS,\n", + " subscription_id= subscription_id,\n", + " resource_group_name= resource_group_name\n", + " workspace_name= workspace_name\n", ")\n", "\n", "print(\n", From 540a35a2539a44c25e2a14848741d1e99a51f63a Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Thu, 10 Oct 2024 13:21:37 -0400 Subject: [PATCH 10/17] fixed small linting error --- source/examples/rapids-azureml-hpo/notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb index 4b8ca523..256a05dc 100644 --- a/source/examples/rapids-azureml-hpo/notebook.ipynb +++ b/source/examples/rapids-azureml-hpo/notebook.ipynb @@ -106,7 +106,7 @@ "ml_client = MLClient(\n", " credential=DefaultAzureCredential(),\n", " subscription_id= subscription_id,\n", - " resource_group_name= resource_group_name\n", + " resource_group_name= resource_group_name,\n", " workspace_name= workspace_name\n", ")\n", "\n", From ef7a978ded63daab557508757e768f317fb3911f Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Thu, 10 Oct 2024 14:16:02 -0400 Subject: [PATCH 11/17] updated ubuntu versions --- source/guides/azure/infiniband.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/guides/azure/infiniband.md b/source/guides/azure/infiniband.md index daca2391..f1e38327 100644 --- a/source/guides/azure/infiniband.md +++ b/source/guides/azure/infiniband.md @@ -13,8 +13,8 @@ for demonstration. - Select `East US` region. - Change `Availability options` to `Availability set` and create a set. - If building multiple instances put additional instances in the same set. -- Use the 2nd Gen Ubuntu 20.04 image. - - Search all images for `Ubuntu Server 20.04` and choose the second one down on the list. +- Use the 2nd Gen Ubuntu 24.04 image. + - Search all images for `Ubuntu Server 24.04` and choose the second one down on the list. - Change size to `ND40rs_v2`. - Set password login with credentials. - User `someuser` @@ -39,8 +39,8 @@ The commands below should work for Ubuntu. See the [CUDA Toolkit documentation]( ```shell sudo apt-get install -y linux-headers-$(uname -r) distribution=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') -wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb +wget https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update sudo apt-get -y install cuda-drivers ``` @@ -118,7 +118,7 @@ Mon Nov 14 20:32:39 2022 ### InfiniBand Driver -On Ubuntu 20.04 +On Ubuntu 24.04 ```shell sudo apt-get install -y automake dh-make git libcap2 libnuma-dev libtool make pkg-config udev curl librdmacm-dev rdma-core \ From 8a204a9a11b0aab95398f3452f2772ba70953667 Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Thu, 10 Oct 2024 14:43:05 -0400 Subject: [PATCH 12/17] got rid of outdated package --- source/guides/azure/infiniband.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/guides/azure/infiniband.md b/source/guides/azure/infiniband.md index f1e38327..3d1ec48e 100644 --- a/source/guides/azure/infiniband.md +++ b/source/guides/azure/infiniband.md @@ -122,7 +122,7 @@ On Ubuntu 24.04 ```shell sudo apt-get install -y automake dh-make git libcap2 libnuma-dev libtool make pkg-config udev curl librdmacm-dev rdma-core \ - libgfortran5 bison chrpath flex graphviz gfortran tk dpatch quilt swig tcl ibverbs-utils + libgfortran5 bison chrpath flex graphviz gfortran tk quilt swig tcl ibverbs-utils ``` Check install From 8c3a176ce13ea5101a60816f63ae23291e27619f Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Thu, 10 Oct 2024 14:44:14 -0400 Subject: [PATCH 13/17] added intermediary step for clarity --- source/guides/azure/infiniband.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/source/guides/azure/infiniband.md b/source/guides/azure/infiniband.md index 3d1ec48e..6e3ef981 100644 --- a/source/guides/azure/infiniband.md +++ b/source/guides/azure/infiniband.md @@ -247,14 +247,20 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforg bash Mambaforge-Linux-x86_64.sh ``` -Accept the default and allow conda init to run. Then start a new shell. +Accept the default and allow conda init to run. +``shell +~/mambaforge/bin/conda init + +```` + +Then start a new shell. Create a conda environment (see [UCX-Py](https://ucx-py.readthedocs.io/en/latest/install.html) docs) ```shell mamba create -n ucxpy {{ rapids_conda_channels }} {{ rapids_conda_packages }} ipython ucx-proc=*=gpu ucx ucx-py dask distributed numpy cupy pytest pynvml -y mamba activate ucxpy -``` +```` Clone UCX-Py repo locally From 5aefb03e4326a1631611c4af7236c7ca4e1aab1a Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Fri, 11 Oct 2024 08:50:00 -0400 Subject: [PATCH 14/17] fix linting issues --- source/examples/rapids-azureml-hpo/notebook.ipynb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb index 256a05dc..14575363 100644 --- a/source/examples/rapids-azureml-hpo/notebook.ipynb +++ b/source/examples/rapids-azureml-hpo/notebook.ipynb @@ -97,7 +97,6 @@ "from azure.ai.ml import MLClient\n", "from azure.identity import DefaultAzureCredential\n", "\n", - "\n", "subscription_id = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", "resource_group_name = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", "workspace_name = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", @@ -105,9 +104,9 @@ "# Get a handle to the workspace\n", "ml_client = MLClient(\n", " credential=DefaultAzureCredential(),\n", - " subscription_id= subscription_id,\n", - " resource_group_name= resource_group_name,\n", - " workspace_name= workspace_name\n", + " subscription_id=subscription_id,\n", + " resource_group_name=resource_group_name,\n", + " workspace_name=workspace_name,\n", ")\n", "\n", "print(\n", From 7c923d4f711e865e92ca53b70a09e0a1f090fa31 Mon Sep 17 00:00:00 2001 From: Melody Wang Date: Fri, 11 Oct 2024 08:58:33 -0400 Subject: [PATCH 15/17] fixed backtick issues --- source/guides/azure/infiniband.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/source/guides/azure/infiniband.md b/source/guides/azure/infiniband.md index 6e3ef981..2153c707 100644 --- a/source/guides/azure/infiniband.md +++ b/source/guides/azure/infiniband.md @@ -248,10 +248,11 @@ bash Mambaforge-Linux-x86_64.sh ``` Accept the default and allow conda init to run. -``shell + +```shell ~/mambaforge/bin/conda init -```` +``` Then start a new shell. @@ -260,7 +261,7 @@ Create a conda environment (see [UCX-Py](https://ucx-py.readthedocs.io/en/latest ```shell mamba create -n ucxpy {{ rapids_conda_channels }} {{ rapids_conda_packages }} ipython ucx-proc=*=gpu ucx ucx-py dask distributed numpy cupy pytest pynvml -y mamba activate ucxpy -```` +``` Clone UCX-Py repo locally From 19a4bf21749553d1d945574f1aa57fba06b072aa Mon Sep 17 00:00:00 2001 From: Melody Wang <98235366+melodywang060@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:30:40 -0400 Subject: [PATCH 16/17] Update source/cloud/azure/aks.md Co-authored-by: James Lamb --- source/cloud/azure/aks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/cloud/azure/aks.md b/source/cloud/azure/aks.md index d70c95fa..8917fc3c 100644 --- a/source/cloud/azure/aks.md +++ b/source/cloud/azure/aks.md @@ -92,7 +92,7 @@ $ az extension add --name aks-preview ````` ```bash - az aks nodepool add \ +az aks nodepool add \ --resource-group \ --cluster-name rapids \ --name gpunp \ From ae423a8c2e86b05b06f83e316cf86400fbebf2fe Mon Sep 17 00:00:00 2001 From: Melody Wang <98235366+melodywang060@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:30:46 -0400 Subject: [PATCH 17/17] Update source/guides/azure/infiniband.md Co-authored-by: James Lamb --- source/guides/azure/infiniband.md | 1 - 1 file changed, 1 deletion(-) diff --git a/source/guides/azure/infiniband.md b/source/guides/azure/infiniband.md index 2153c707..aaaff5a4 100644 --- a/source/guides/azure/infiniband.md +++ b/source/guides/azure/infiniband.md @@ -251,7 +251,6 @@ Accept the default and allow conda init to run. ```shell ~/mambaforge/bin/conda init - ``` Then start a new shell.