From 256e0ead98add41784b07666c28d20dd68bcea71 Mon Sep 17 00:00:00 2001 From: skirui-source Date: Mon, 26 Jun 2023 22:37:15 -0700 Subject: [PATCH] delete old notebook --- .../rapids-azureml-hpo/notebook.ipynb | 569 ------------------ 1 file changed, 569 deletions(-) delete mode 100644 source/examples/rapids-azureml-hpo/notebook.ipynb diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb deleted file mode 100644 index 2d2c92c6..00000000 --- a/source/examples/rapids-azureml-hpo/notebook.ipynb +++ /dev/null @@ -1,569 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Train and hyperparameter tune with RAPIDS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create an Azure ML [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) and set up the environment on your local computer following the steps in [??????], or run this notebook in an Azure ML Compute Instance\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: azure-ai-ml\n", - "Version: 1.2.0\n", - "Summary: Microsoft Azure Machine Learning Client Library for Python\n", - "Home-page: https://github.com/Azure/azure-sdk-for-python\n", - "Author: Microsoft Corporation\n", - "Author-email: azuresdkengsysadmins@microsoft.com\n", - "License: MIT License\n", - "Location: /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages\n", - "Requires: azure-common, azure-core, azure-mgmt-core, azure-storage-blob, azure-storage-file-datalake, azure-storage-file-share, colorama, isodate, jsonschema, marshmallow, msrest, opencensus-ext-azure, pydash, pyjwt, pyyaml, strictyaml, tqdm, typing-extensions\n", - "Required-by: \n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# verify Azure ML SDK version\n", - "\n", - "%pip show azure-ai-ml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize workspace" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Initialize the `MLClient` class to handle the workspace you created in the prerequisites step. 
`MLClient.from_config(credential, path)`\n", "creates a workspace object from the details stored in `config.json`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Workspace name: rapids-aml-cluster\n", - "Subscription id: fc4f4a6b-4041-4b1c-8249-854d68edcf62\n", - "Resource group: rapidsai-deployment\n" - ] - } - ], - "source": [ - "from azure.ai.ml import MLClient\n", - "from azure.identity import DefaultAzureCredential\n", - "\n", - "\n", - "# Get a handle to the workspace\n", - "ml_client = MLClient(\n", - " credential=DefaultAzureCredential(),\n", - " subscription_id=\"fc4f4a6b-4041-4b1c-8249-854d68edcf62\",\n", - " resource_group_name=\"rapidsai-deployment\",\n", - " workspace_name=\"rapids-aml-cluster\",\n", - ")\n", - "\n", - "print(\n", - " \"Workspace name: \" + ml_client.workspace_name,\n", - " \"Subscription id: \" + ml_client.subscription_id,\n", - " \"Resource group: \" + ml_client.resource_group_name,\n", - " sep=\"\\n\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Access data from Datastore URI" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example, we will use 20 million rows (samples) of the airline dataset. The [datastore uri](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-access-data-interactive?tabs=adls#access-data-from-a-datastore-uri-like-a-filesystem-preview) below references a data storage location (path) containing the parquet files, which you can download to your local computer or mount to your AML compute.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data uri: \n", - " azureml://subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster/datastores/workspaceartifactstore/paths/airline_20000000.parquet\n" - ] - } - ], - "source": [ - "datastore_name = \"workspaceartifactstore\"\n", - "dataset = \"airline_20000000.parquet\"\n", - "\n", - "# Datastore uri format:\n", - "data_uri = f\"azureml://subscriptions/{ml_client.subscription_id}/resourcegroups/{ml_client.resource_group_name}/workspaces/{ml_client.workspace_name}/datastores/{datastore_name}/paths/{dataset}\"\n", - "\n", - "print(\"data uri:\", \"\\n\", data_uri)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create AML compute" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this notebook, we will use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training using a dynamically scalable pool of compute resources.\n", - "\n", - "This notebook scales the cluster up to 5 nodes for hyperparameter optimization; you can modify `max_instances` based on the available quota in the desired region. Similar to other Azure ML services, there are limits on AmlCompute; this [article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) includes details on the default limits and how to request more quota."
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`size` describes the virtual machine type and size that will be used in the cluster. RAPIDS requires NVIDIA Pascal or newer architecture, so \n", - "you will need to select compute targets from one of the \n", - "[GPU virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) provisioned with P100, P40 and V100 GPUs: `NC_v2`, `NC_v3`, `ND` or `ND_v2` \n", - "\n", - "Let's create an `AmlCompute` cluster of `Standard_NC12s_v3` GPU VMs:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "found compute target. Will use gpu-cluster\n" - ] - } - ], - "source": [ - "from azure.ai.ml.entities import AmlCompute\n", - "\n", - "# specify aml compute name.\n", - "gpu_compute_target = \"gpu-cluster\"\n", - "\n", - "try:\n", - " # let's see if the compute target already exists\n", - " gpu_target = ml_client.compute.get(gpu_compute_target)\n", - " print(f\"found compute target. Will use {gpu_compute_target}\")\n", - "except Exception:\n", - " print(\"Creating a new gpu compute target...\")\n", - "\n", - " gpu_target = AmlCompute(\n", - " name=\"gpu-cluster\",\n", - " type=\"amlcompute\",\n", - " size=\"STANDARD_NC12S_V3\",\n", - " max_instances=5,\n", - " idle_time_before_scale_down=300,\n", - " )\n", - " ml_client.compute.begin_create_or_update(gpu_target).result()\n", - "\n", - " print(\n", - " f\"AMLCompute with name {gpu_target.name} is created, the compute size is {gpu_target.size}\"\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Prepare training script" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a project directory with your code to run on the remote resource. This includes the training script and additional files your training script depends on. In this example, the training script is provided:\n", - "\n", - "`train_rapids.py` - the entry script for the RAPIDS environment; it loads the dataset into a cuDF dataframe, then trains a Random Forest model and runs inference using cuML."
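The body of `train_rapids.py` is not shown in this notebook. For orientation only, a minimal GPU training script could look something like the sketch below; it is not the script shipped with this example. The label column name `ArrDelayBinary` is an assumption, and the real script also accepts the `--n_bins`, `--compute` and `--cv_folds` arguments passed by the job later in this notebook.

```python
# Illustrative sketch only, not the actual train_rapids.py from this example.
import argparse

import cudf
import mlflow
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
from cuml.model_selection import train_test_split

parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", type=str)
parser.add_argument("--n_estimators", type=int, default=50)
parser.add_argument("--max_depth", type=int, default=10)
parser.add_argument("--max_features", type=float, default=1.0)
args, _ = parser.parse_known_args()  # tolerate the extra arguments the job passes

# Load the parquet data straight into GPU memory with cuDF
df = cudf.read_parquet(args.data_dir)
X = df.drop(columns=["ArrDelayBinary"])  # assumed label column name
y = df["ArrDelayBinary"].astype("int32")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77)

# Train a cuML Random Forest with the hyperparameters being tuned
clf = RandomForestClassifier(
    n_estimators=args.n_estimators,
    max_depth=args.max_depth,
    max_features=args.max_features,
)
clf.fit(X_train, y_train)

# Log the metric the sweep optimizes (assumes mlflow / azureml-mlflow
# are available in the training environment)
acc = accuracy_score(y_test, clf.predict(X_test))
mlflow.log_metric("Accuracy", float(acc))
```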
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copy the training script `train_rapids.py` into your project directory:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "project_folder = \"./train_rapids\" # create folder in same dir\n", - "os.makedirs(project_folder, exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import shutil\n", - "\n", - "\n", - "# resolve the path to the folder that contains the training scripts\n", - "notebook_path = os.path.realpath(\"__file__\" + \"/../../code\")\n", - "rapids_script = os.path.join(notebook_path, \"train_rapids.py\")\n", - "azure_script = os.path.join(notebook_path, \"rapids_csp_azure.py\")\n", - "\n", - "\n", - "shutil.copy(rapids_script, project_folder)\n", - "shutil.copy(azure_script, project_folder)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train model on the remote compute" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that you have your data and training script prepared, you are ready to train on your remote compute:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create experiment\n", - "\n", - "Track all the runs in your workspace" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "experiment_name = \"test_rapids_gpu_cluster\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use Custom Docker Image" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll be using a [custom](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-environments-v2?tabs=python#create-an-environment-from-a-docker-image) RAPIDS docker image to set up the environment. It is available in the [rapidsai/rapidsai repo](https://hub.docker.com/r/rapidsai/rapidsai/) on Docker Hub." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azure.ai.ml.entities import Environment, BuildContext\n", - "\n", - "env_docker_image = Environment(\n", - " build=BuildContext(path=\"./docker\"),\n", - " name=\"rapids-docker-image-2302\",\n", - " description=\"Rapids v23.02 Environment\",\n", - ")\n", - "\n", - "ml_client.environments.create_or_update(env_docker_image)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Submit the training job " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will configure and run a training job using the `command` class. 
The [command](https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml?view=azure-python#azure-ai-ml-command) can be used to run standalone jobs or as a function inside pipelines.\n", - "`inputs` is a dictionary of command-line arguments to pass to the training script.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azure.ai.ml import command, Input\n", - "\n", - "\n", - "command_job = command(\n", - " environment=\"rapids-docker-image-2302:2\",\n", - " experiment_name=experiment_name,\n", - " code=project_folder,\n", - " command=\"python train_rapids.py --data_dir ${{inputs.data_dir}} --n_bins ${{inputs.n_bins}} --compute ${{inputs.compute}} --cv_folds ${{inputs.cv_folds}}\\\n", - " --n_estimators ${{inputs.n_estimators}} --max_depth ${{inputs.max_depth}} --max_features ${{inputs.max_features}}\",\n", - " inputs={\n", - " \"data_dir\": Input(type=\"uri_file\", path=data_uri),\n", - " \"n_bins\": 32,\n", - " \"compute\": \"single-GPU\", # multi-GPU for algorithms via Dask\n", - " \"cv_folds\": 5,\n", - " \"n_estimators\": 50,\n", - " \"max_depth\": 10,\n", - " \"max_features\": 1.0,\n", - " },\n", - " compute=\"gpu-cluster\",\n", - ")\n", - "\n", - "\n", - "# submit the command\n", - "returned_job = ml_client.jobs.create_or_update(command_job)\n", - "\n", - "# get a URL for the status of the job\n", - "returned_job.studio_url" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tune model hyperparameters" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can optimize our model's hyperparameters and improve the accuracy using Azure Machine Learning's hyperparameter tuning capabilities." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Start a hyperparameter sweep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define the hyperparameter space to sweep over. We will tune `n_estimators`, `max_depth` and `max_features` parameters. In this example we will use random sampling to try different configuration sets of hyperparameters and maximize `Accuracy`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azure.ai.ml.sweep import Choice, Uniform, MedianStoppingPolicy\n", - "\n", - "command_job_for_sweep = command_job(\n", - " n_estimators=Choice(values=range(50, 500)),\n", - " max_depth=Choice(values=range(5, 19)),\n", - " max_features=Uniform(min_value=0.2, max_value=1.0),\n", - ")\n", - "\n", - "# apply the sweep parameter to obtain the sweep_job\n", - "sweep_job = command_job_for_sweep.sweep(\n", - " compute=\"gpu-cluster\",\n", - " sampling_algorithm=\"random\",\n", - " primary_metric=\"Accuracy\",\n", - " goal=\"Maximize\",\n", - ")\n", - "\n", - "# define the limits for this sweep\n", - "sweep_job.set_limits(max_total_trials=10, max_concurrent_trials=5, timeout=300)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This will launch the RAPIDS training script with parameters that were specified in the cell above." 
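Note that `MedianStoppingPolicy` is imported in the sweep cell above but never attached to the job. If the training script reported the primary metric several times per trial, an early-termination policy could be set before submission. The sketch below uses the class already imported above; the interval and delay values are illustrative, not taken from this example.

```python
# Optional: stop poorly performing trials early. This is only meaningful if
# train_rapids.py logs "Accuracy" more than once per run; the values below
# are placeholders.
sweep_job.early_termination = MedianStoppingPolicy(
    evaluation_interval=2,
    delay_evaluation=5,
)
```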
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# submit the hpo job\n", - "returned_sweep_job = ml_client.create_or_update(sweep_job)\n", - "\n", - "# get a URL for the status of the job\n", - "returned_sweep_job.studio_url" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Monitor SweepJob runs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Monitor and view the progress of the machine learning training run with MLflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Find and register best model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download best trial model output\n", - "\n", - "best_sweep = ml_client.jobs.download(returned_sweep_job.name, output_name=\"best_model\")\n", - "print(best_sweep)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "List the model files uploaded during the run:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Register the folder (and all files in it) as a model named `train-rapids` under the workspace for deployment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Delete cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ml_client.compute.begin_delete(gpu_compute_target).wait()" - ] - } - ], - "metadata": { - "kernel_info": { - "name": "rapids" - }, - "kernelspec": { - "display_name": "Python 3.10 - SDK v2", - "language": "python", - "name": "python310-sdkv2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "microsoft": { - "ms_spell_check": { - "ms_spell_check_language": "en" - } - }, - "nteract": { - "version": "nteract-front-end@1.0.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
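The MLflow monitoring cell in the deleted notebook was left empty. As a rough sketch of what it might contain, the sweep's trials can be listed through the workspace tracking server; this assumes the `azureml-mlflow` plugin is installed in the kernel and that the training script logs a metric named `Accuracy`.

```python
import mlflow

# Point MLflow at this workspace's tracking server
ws = ml_client.workspaces.get(ml_client.workspace_name)
mlflow.set_tracking_uri(ws.mlflow_tracking_uri)

# List the runs recorded under the experiment used by the sweep
runs = mlflow.search_runs(experiment_names=[experiment_name])

# Column names depend on what train_rapids.py logs; "metrics.Accuracy" is assumed here
print(runs[["run_id", "status", "metrics.Accuracy"]].head())
```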