From ab82996e2a112b76fca1fa711fd933ab14248f80 Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Tue, 24 Jan 2023 12:17:48 +0000 Subject: [PATCH] Add some example notebooks and a notebook related files sidebar item (#108) * Copy in a couple of notebooks from cloud-ml-examples for testing * Add custom sphinx extension to show related files in notebook sidebar --- .gitignore | 2 + conda/environments/deployment_docs.yml | 2 +- extensions/rapids_notebook_files.py | 40 + .../_templates/notebooks-extra-files-nav.html | 16 + source/conf.py | 38 +- source/examples/index.md | 43 + .../rapids-sagemaker-higgs/Dockerfile | 12 + .../rapids-sagemaker-higgs/notebook.ipynb | 423 +++++++++ .../rapids-sagemaker-higgs/rapids-higgs.py | 66 ++ .../notebook.ipynb | 808 ++++++++++++++++++ source/index.md | 14 +- 11 files changed, 1449 insertions(+), 15 deletions(-) create mode 100644 extensions/rapids_notebook_files.py create mode 100644 source/_templates/notebooks-extra-files-nav.html create mode 100644 source/examples/index.md create mode 100644 source/examples/rapids-sagemaker-higgs/Dockerfile create mode 100644 source/examples/rapids-sagemaker-higgs/notebook.ipynb create mode 100644 source/examples/rapids-sagemaker-higgs/rapids-higgs.py create mode 100644 source/examples/xgboost-gpu-hpo-job-parallel-k8s/notebook.ipynb diff --git a/.gitignore b/.gitignore index 89efb0f7..0eda6aca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ build *.swp + +__pycache__ diff --git a/conda/environments/deployment_docs.yml b/conda/environments/deployment_docs.yml index c12dfcf6..a0b67055 100644 --- a/conda/environments/deployment_docs.yml +++ b/conda/environments/deployment_docs.yml @@ -7,7 +7,7 @@ dependencies: - myst-parser - nbsphinx - numpydoc - - pydata-sphinx-theme + - pydata-sphinx-theme>=0.12.0 - python=3.9 - pre-commit - sphinx diff --git a/extensions/rapids_notebook_files.py b/extensions/rapids_notebook_files.py new file mode 100644 index 00000000..48486ca5 --- /dev/null +++ 
b/extensions/rapids_notebook_files.py
@@ -0,0 +1,42 @@
+import pathlib
+
+
+def find_notebook_related_files(app, pagename, templatename, context, doctree):
+    """Find related files for Jupyter Notebooks in the examples section.
+
+    Example notebooks should be placed in /source/examples in their own directories.
+    This extension walks through the directory when each notebook is rendered and generates
+    a list of all the other files in the directory.
+
+    The goal is to set a list of GitHub URLs in the template context so we can render
+    them in the sidebar. To get the GitHub url we use the ``rapids_deployment_notebooks_base_url`` config
+    option which shows the base url for where the source files are on GitHub.
+
+    Only regular files are listed (a ``blob`` URL cannot point at a directory) and the
+    list is sorted so the sidebar is identical from one build to the next.
+
+    """
+    if "examples/" in pagename and context.get("page_source_suffix") == ".ipynb":
+        source_root = pathlib.Path(__file__).parent / ".." / "source"
+        base_url = app.config.rapids_deployment_notebooks_base_url
+        rel_page_parent = pathlib.Path(pagename).parent
+        path_to_page_parent = source_root / rel_page_parent
+
+        # glob() order depends on the filesystem, so sort for reproducible output.
+        context["related_notebook_files"] = [
+            f"{base_url}{rel_page_parent.as_posix()}/{page.name}"
+            for page in sorted(path_to_page_parent.glob("*"))
+            if page.is_file() and page.suffix != ".ipynb"
+        ]
+
+
+def setup(app):
+    """Register the base-url config value and hook the html-page-context event."""
+    app.add_config_value("rapids_deployment_notebooks_base_url", "", "html")
+    app.connect("html-page-context", find_notebook_related_files)
+
+    return {
+        "version": "0.1",
+        "parallel_read_safe": True,
+        "parallel_write_safe": True,
+    }
diff --git a/source/_templates/notebooks-extra-files-nav.html b/source/_templates/notebooks-extra-files-nav.html
new file mode 100644
index 00000000..58ce4605
--- /dev/null
+++ b/source/_templates/notebooks-extra-files-nav.html
@@ -0,0 +1,16 @@
+{% if related_notebook_files %}
+
+ Related files +
+ +{% endif %} diff --git a/source/conf.py b/source/conf.py index 164b5b3e..bb7239af 100644 --- a/source/conf.py +++ b/source/conf.py @@ -10,17 +10,18 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os +import sys + +sys.path.insert(0, os.path.abspath("../extensions")) import datetime # -- Project information ----------------------------------------------------- -project = 'RAPIDS Deployment Documentation' -copyright = f'{datetime.date.today().year}, NVIDIA' -author = 'NVIDIA' +project = "RAPIDS Deployment Documentation" +copyright = f"{datetime.date.today().year}, NVIDIA" +author = "NVIDIA" # -- General configuration --------------------------------------------------- @@ -34,18 +35,25 @@ "sphinxcontrib.mermaid", "sphinx_design", "sphinx_copybutton", + "rapids_notebook_files", ] myst_enable_extensions = ["colon_fence"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] +# -- Options for notebooks ------------------------------------------------- + +nb_execution_mode = "off" +rapids_deployment_notebooks_base_url = ( + "https://github.com/rapidsai/deployment/blob/main/source/" +) # -- Options for HTML output ------------------------------------------------- @@ -55,18 +63,23 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "secondary_sidebar_items": [ + "page-toc", + "notebooks-extra-files-nav", + ], } + # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
# -html_theme = 'pydata_sphinx_theme' +html_theme = "pydata_sphinx_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] -html_logo = '_static/RAPIDS-logo-purple.png' +html_static_path = ["_static"] +html_logo = "_static/RAPIDS-logo-purple.png" intersphinx_mapping = { "python": ("https://docs.python.org/3", None), @@ -76,7 +89,10 @@ "dask_cuda": ("https://docs.rapids.ai/api/dask-cuda/stable/", None), } + def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") - app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer") + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) app.add_js_file("js/nav.js", loading_method="defer") diff --git a/source/examples/index.md b/source/examples/index.md new file mode 100644 index 00000000..c0f3ef13 --- /dev/null +++ b/source/examples/index.md @@ -0,0 +1,43 @@ +--- +html_theme.sidebar_secondary.remove: true +--- + +# Workflow Examples + +`````{grid} 1 2 2 3 +:gutter: 2 2 2 2 + +````{grid-item-card} +:link: xgboost-gpu-hpo-job-parallel-k8s/notebook +:link-type: doc +XGBoost HPO +^^^ +Scaling up hyperparameter optimization with Kubernetes and XGBoost GPU algorithm + +{bdg-primary}`Dask` +{bdg-primary}`XGBoost` +{bdg-primary}`Kubernetes` +```` + +````{grid-item-card} +:link: rapids-sagemaker-higgs/notebook +:link-type: doc +HPO on Sagemaker with cuML +^^^ +Running RAPIDS hyperparameter experiments at scale on Amazon SageMaker + +{bdg-primary}`cuML` +{bdg-primary}`Sagemaker` +{bdg-primary}`AWS` +```` + +````` + +```{toctree} +:maxdepth: 2 +:caption: Workflow Examples +:hidden: + +xgboost-gpu-hpo-job-parallel-k8s/notebook +rapids-sagemaker-higgs/notebook +``` diff --git a/source/examples/rapids-sagemaker-higgs/Dockerfile 
b/source/examples/rapids-sagemaker-higgs/Dockerfile new file mode 100644 index 00000000..b548cdf6 --- /dev/null +++ b/source/examples/rapids-sagemaker-higgs/Dockerfile @@ -0,0 +1,12 @@ +# FROM rapidsai/rapidsai-cloud-ml:latest +FROM rapidsai/rapidsai-nightly:22.12-cuda11.5-runtime-ubuntu18.04-py3.9 + +RUN apt-get update && apt-get install -y --no-install-recommends build-essential + +RUN source activate rapids && pip install sagemaker-training + +# Copies the training code inside the container +COPY rapids-higgs.py /opt/ml/code/rapids-higgs.py + +# Defines rapids-higgs.py as script entry point +ENV SAGEMAKER_PROGRAM rapids-higgs.py \ No newline at end of file diff --git a/source/examples/rapids-sagemaker-higgs/notebook.ipynb b/source/examples/rapids-sagemaker-higgs/notebook.ipynb new file mode 100644 index 00000000..125f0426 --- /dev/null +++ b/source/examples/rapids-sagemaker-higgs/notebook.ipynb @@ -0,0 +1,423 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Running RAPIDS hyperparameter experiments at scale on Amazon SageMaker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import packages and create Amazon SageMaker and Boto3 sessions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import time\n", + "import boto3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "execution_role = sagemaker.get_execution_role()\n", + "session = sagemaker.Session()\n", + "\n", + "region = boto3.Session().region_name\n", + "account = boto3.client('sts').get_caller_identity().get('Account')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "account, region" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload the higgs-boson dataset to s3 bucket" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir dataset\n", + "!wget -P dataset https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz\n", + "!gunzip dataset/HIGGS.csv.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_data_dir = session.upload_data(path='dataset', key_prefix='dataset/higgs-dataset')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_data_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Download latest RAPIDS container with cloud-ml examples\n", + "\n", + "Extend the container by copying the training script and installing [SageMaker Training toolkit](https://github.com/aws/sagemaker-training-toolkit) to makes RAPIDS compatible with SageMaker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# estimator_info = {\n", + "# 'rapids_container': 'rapidsai/rapidsai-cloud-ml:latest',\n", + "# 'ecr_image': 'sagemaker-rapids-cloud-ml:latest',\n", + "# 'ecr_repository': 'sagemaker-rapids-cloud-ml'\n", + "# }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "estimator_info = {\n", + " 'rapids_container':'rapidsai/rapidsai-nightly:22.12-cuda11.5-runtime-ubuntu18.04-py3.9',\n", + " 'ecr_image':'sagemaker-rapids-nightly',\n", + " 'ecr_repository':'sagemaker-rapids-nightly'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "!docker pull {estimator_info['rapids_container']}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!cat docker/Dockerfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "#FROM rapidsai/rapidsai-cloud-ml:latest\n", + "#!docker build -t sagemaker-rapids:latest docker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker build -t sagemaker-rapids-nightly docker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker images" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Publish to Elastic Container Registry" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: SageMaker does not support using training images from private docker registry (ie. DockerHub), so we need to push\n", + "the SageMaker-compatible \\\n", + "RAPIDS container to the Amazon Elastic Container Registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ECR_container_fullname = f\"{account}.dkr.ecr.{region}.amazonaws.com/{estimator_info['ecr_image']}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ECR_container_fullname " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker tag {estimator_info['rapids_container']} {ECR_container_fullname}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print( f\"source : {estimator_info['rapids_container']}\\n\"\n", + " f\"destination : {ECR_container_fullname}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!aws ecr create-repository --repository-name {estimator_info['ecr_repository']}\n", + "!$(aws ecr get-login 
--no-include-email --region {region})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!docker push {ECR_container_fullname}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Define hyperparameters: start with best guess values\n", + "Find the full list of Random Forest hyperparameters here in the RAPIDS doc page:\n", + "
\n", + "https://docs.rapids.ai/api/cuml/stable/api.html#random-forest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hyperparams={ \n", + " 'n_estimators' : 15,\n", + " 'max_depth' : 5,\n", + " 'n_bins' : 8,\n", + " 'split_criterion' : 0, # GINI:0, ENTROPY:1\n", + " 'bootstrap' : 0, # true: sample with replacement, false: sample without replacement\n", + " 'max_leaves' : -1, # unlimited leaves\n", + " 'max_features' : 0.2, \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.estimator import Estimator\n", + "\n", + "rapids_estimator = Estimator(image_uri=ECR_container_fullname,\n", + " role=execution_role,\n", + " instance_count=1,\n", + " instance_type='ml.g4dn.4xlarge',\n", + " hyperparameters=hyperparams,\n", + " metric_definitions=[{'Name': 'test_acc', 'Regex': 'test_acc: ([0-9\\\\.]+)'}])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "rapids_estimator.fit(inputs={'dataset': s3_data_dir})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner\n", + "\n", + "hyperparameter_ranges = {\n", + " 'n_estimators' : IntegerParameter(10, 200), \n", + " 'max_depth' : IntegerParameter(1, 22),\n", + " 'n_bins' : IntegerParameter(5, 24),\n", + " 'split_criterion' : CategoricalParameter([0, 1]),\n", + " 'bootstrap' : CategoricalParameter([True, False]),\n", + " 'max_features' : ContinuousParameter(0.01, 0.5),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.estimator import Estimator\n", + "\n", + "rapids_estimator = Estimator(image_uri=ECR_container_fullname,\n", + " role=execution_role,\n", + "
instance_count=1,\n", + " instance_type='ml.p3.2xlarge',\n", + " hyperparameters=hyperparams,\n", + " metric_definitions=[{'Name': 'test_acc', 'Regex': 'test_acc: ([0-9\\\\.]+)'}])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tuner = HyperparameterTuner(rapids_estimator,\n", + " objective_metric_name='test_acc',\n", + " hyperparameter_ranges=hyperparameter_ranges,\n", + " strategy='Bayesian',\n", + " max_jobs=1,\n", + " max_parallel_jobs=1,\n", + " objective_type='Maximize',\n", + " metric_definitions=[{'Name': 'test_acc', 'Regex': 'test_acc: ([0-9\\\\.]+)'}])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job_name = 'rapidsHPO' + time.strftime('%Y-%m-%d-%H-%M-%S-%j', time.gmtime())\n", + "tuner.fit({'dataset': s3_data_dir}, job_name=job_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Delete S3 buckets and files you don't need\n", + "- Kill training jobs that you don't want running\n", + "- Delete container images and the repository you just created" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "aws ecr delete-repository --force --repository-name" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deployment-docs-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) \n[GCC 10.3.0]" + }, + "vscode": { + 
"interpreter": { + "hash": "f8a5d5d9459d186eb4ada656f03b674a34a7efd585f7a94c29caa9bfea516aa8" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/source/examples/rapids-sagemaker-higgs/rapids-higgs.py b/source/examples/rapids-sagemaker-higgs/rapids-higgs.py new file mode 100644 index 00000000..b0818e53 --- /dev/null +++ b/source/examples/rapids-sagemaker-higgs/rapids-higgs.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# coding: utf-8 + +from cuml import RandomForestClassifier as cuRF +from cuml.preprocessing.model_selection import train_test_split +import cudf +import numpy as np +import pandas as pd +from sklearn.metrics import accuracy_score +import os +from urllib.request import urlretrieve +import gzip +import argparse + + +def main(args): + + # SageMaker options + model_dir = args.model_dir + data_dir = args.data_dir + + col_names = ['label'] + ["col-{}".format(i) for i in range(2, 30)] # Assign column names + dtypes_ls = ['int32'] + ['float32' for _ in range(2, 30)] # Assign dtypes to each column + + data = cudf.read_csv(data_dir+'HIGGS.csv', names=col_names, dtype=dtypes_ls) + X_train, X_test, y_train, y_test = train_test_split(data, 'label', train_size=0.70) + + # Hyper-parameters + hyperparams={ + 'n_estimators' : args.n_estimators, + 'max_depth' : args.max_depth, + 'n_bins' : args.n_bins, + 'split_criterion' : args.split_criterion, + 'bootstrap' : args.bootstrap, + 'max_leaves' : args.max_leaves, + 'max_features' : args.max_features + } + + cu_rf = cuRF(**hyperparams) + cu_rf.fit(X_train, y_train) + + print("test_acc:", accuracy_score(cu_rf.predict(X_test), y_test) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + # Hyper-parameters + parser.add_argument('--n_estimators', type=int, default=20) + parser.add_argument('--max_depth', type=int, default=16) + parser.add_argument('--n_bins', type=int, default=8) + parser.add_argument('--split_criterion', type=int, default=0) + parser.add_argument('--bootstrap', type=bool, 
default=True) + parser.add_argument('--max_leaves', type=int, default=-1) + parser.add_argument('--max_features', type=float, default=0.2) + + # SageMaker parameters + parser.add_argument('--model_dir', type=str) + parser.add_argument('--model_output_dir', type=str, default='/opt/ml/output/') + parser.add_argument('--data_dir', type=str, default='/opt/ml/input/data/dataset/') + + args = parser.parse_args() + main(args) + + diff --git a/source/examples/xgboost-gpu-hpo-job-parallel-k8s/notebook.ipynb b/source/examples/xgboost-gpu-hpo-job-parallel-k8s/notebook.ipynb new file mode 100644 index 00000000..29449e3d --- /dev/null +++ b/source/examples/xgboost-gpu-hpo-job-parallel-k8s/notebook.ipynb @@ -0,0 +1,808 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "45684b2f-019b-4ccf-af65-b722aa270e40", + "metadata": {}, + "source": [ + "# Scaling up hyperparameter optimization with Kubernetes and XGBoost GPU algorithm\n", + "\n", + "Choosing an optimal set of hyperparameters is a daunting task, especially for algorithms like XGBoost that have many hyperparameters to tune. In this notebook, we will show how to speed up hyperparameter optimization by running multiple training jobs in parallel on a Kubernetes cluster.\n", + "\n", + "# Prerequisites\n", + "Please follow instructions in [Dask Operator: Installation](https://docs.rapids.ai/deployment/stable/tools/kubernetes/dask-operator.html#installation) to install the Dask operator on top of a GPU-enabled Kubernetes cluster. (For the purpose of this example, you may ignore other sections of the linked document.)\n", + "\n", + "## Optional: Kubeflow\n", + "Kubeflow gives you a nice notebook environment to run this notebook within the k8s cluster. Install Kubeflow by following instructions in [Installing Kubeflow](https://www.kubeflow.org/docs/started/installing-kubeflow/). You may choose any method; we tested this example after installing Kubeflow from manifests." 
+ ] + }, + { + "cell_type": "markdown", + "id": "19c0c861-9357-48b1-a317-a9df3deb2319", + "metadata": {}, + "source": [ + "# Install system packages" + ] + }, + { + "cell_type": "markdown", + "id": "a3177609-0300-461b-a961-8b3a62943e9e", + "metadata": {}, + "source": [ + "We'll need extra Python packages. In particular, we need an unreleased version of Optuna:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "eabb24c4-4358-49a5-9770-88a49bc27689", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting git+https://github.com/optuna/optuna.git@bc6c05dc655aab7e7a02e91e7306609f2a4524ec\n", + " Cloning https://github.com/optuna/optuna.git (to revision bc6c05dc655aab7e7a02e91e7306609f2a4524ec) to /tmp/pip-req-build-vqgp3w_0\n", + " Running command git clone --filter=blob:none --quiet https://github.com/optuna/optuna.git /tmp/pip-req-build-vqgp3w_0\n", + " Running command git rev-parse -q --verify 'sha^bc6c05dc655aab7e7a02e91e7306609f2a4524ec'\n", + " Running command git fetch -q https://github.com/optuna/optuna.git bc6c05dc655aab7e7a02e91e7306609f2a4524ec\n", + " Running command git checkout -q bc6c05dc655aab7e7a02e91e7306609f2a4524ec\n", + " Resolved https://github.com/optuna/optuna.git to commit bc6c05dc655aab7e7a02e91e7306609f2a4524ec\n", + " Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", + "\u001b[?25h Preparing metadata (pyproject.toml) ... 
\u001b[?25ldone\n", + "\u001b[?25hCollecting dask_kubernetes\n", + " Downloading dask_kubernetes-2022.10.1-py3-none-any.whl (145 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m145.5/145.5 kB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: dask>=2022.08.1 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from dask_kubernetes) (2022.9.2)\n", + "Requirement already satisfied: distributed>=2022.08.1 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from dask_kubernetes) (2022.9.2)\n", + "Collecting kubernetes-asyncio>=12.0.1\n", + " Downloading kubernetes_asyncio-24.2.2-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m35.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting kopf>=1.35.3\n", + " Downloading kopf-1.36.0-py3-none-any.whl (205 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m205.9/205.9 kB\u001b[0m \u001b[31m31.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting kubernetes>=12.0.1\n", + " Downloading kubernetes-25.3.0-py2.py3-none-any.whl (1.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m65.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting cmaes>=0.8.2\n", + " Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)\n", + "Collecting importlib-metadata<5.0.0\n", + " Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)\n", + "Collecting colorlog\n", + " Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)\n", + "Requirement already satisfied: tqdm in /opt/conda/envs/rapids/lib/python3.9/site-packages (from optuna==3.1.0.dev0) (4.64.1)\n", + "Collecting alembic>=1.5.0\n", + " Downloading alembic-1.8.1-py3-none-any.whl (209 kB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.8/209.8 kB\u001b[0m \u001b[31m21.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting cliff\n", + " Downloading cliff-4.0.0-py3-none-any.whl (80 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.0/81.0 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy in /opt/conda/envs/rapids/lib/python3.9/site-packages (from optuna==3.1.0.dev0) (1.23.4)\n", + "Requirement already satisfied: PyYAML in /opt/conda/envs/rapids/lib/python3.9/site-packages (from optuna==3.1.0.dev0) (6.0)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from optuna==3.1.0.dev0) (21.3)\n", + "Collecting sqlalchemy>=1.3.0\n", + " Downloading SQLAlchemy-1.4.44-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m71.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting scipy>=1.7.0\n", + " Downloading scipy-1.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m33.8/33.8 MB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting Mako\n", + " Downloading Mako-1.2.4-py3-none-any.whl (78 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.7/78.7 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: partd>=0.3.10 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from dask>=2022.08.1->dask_kubernetes) (1.3.0)\n", + "Requirement already satisfied: fsspec>=0.6.0 in 
/opt/conda/envs/rapids/lib/python3.9/site-packages (from dask>=2022.08.1->dask_kubernetes) (2022.10.0)\n", + "Requirement already satisfied: cloudpickle>=1.1.1 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from dask>=2022.08.1->dask_kubernetes) (2.2.0)\n", + "Requirement already satisfied: toolz>=0.8.2 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from dask>=2022.08.1->dask_kubernetes) (0.12.0)\n", + "Requirement already satisfied: zict>=0.1.3 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (2.2.0)\n", + "Requirement already satisfied: locket>=1.0.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.0.0)\n", + "Requirement already satisfied: psutil>=5.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (5.9.3)\n", + "Requirement already satisfied: jinja2 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (3.1.2)\n", + "Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (2.4.0)\n", + "Requirement already satisfied: tornado<6.2,>=6.0.3 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (6.1)\n", + "Requirement already satisfied: tblib>=1.6.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.7.0)\n", + "Requirement already satisfied: click>=6.6 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (8.1.3)\n", + "Requirement already satisfied: urllib3 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.26.11)\n", + "Requirement already satisfied: msgpack>=0.6.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from 
distributed>=2022.08.1->dask_kubernetes) (1.0.4)\n", + "Requirement already satisfied: zipp>=0.5 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from importlib-metadata<5.0.0->optuna==3.1.0.dev0) (3.10.0)\n", + "Requirement already satisfied: aiohttp<4.0.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from kopf>=1.35.3->dask_kubernetes) (3.8.3)\n", + "Collecting python-json-logger\n", + " Downloading python_json_logger-2.0.4-py3-none-any.whl (7.8 kB)\n", + "Requirement already satisfied: typing-extensions in /opt/conda/envs/rapids/lib/python3.9/site-packages (from kopf>=1.35.3->dask_kubernetes) (4.4.0)\n", + "Collecting iso8601\n", + " Downloading iso8601-1.1.0-py3-none-any.whl (9.9 kB)\n", + "Requirement already satisfied: requests in /opt/conda/envs/rapids/lib/python3.9/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2.28.1)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (1.4.1)\n", + "Requirement already satisfied: setuptools>=21.0.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (60.10.0)\n", + "Collecting google-auth>=1.0.1\n", + " Downloading google_auth-2.14.1-py2.py3-none-any.whl (175 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m175.4/175.4 kB\u001b[0m \u001b[31m28.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: certifi>=14.05.14 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2022.9.24)\n", + "Collecting requests-oauthlib\n", + " Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)\n", + "Requirement already satisfied: python-dateutil>=2.5.3 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2.8.2)\n", + "Requirement already satisfied: six>=1.9.0 
in /opt/conda/envs/rapids/lib/python3.9/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (1.16.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from packaging>=20.0->optuna==3.1.0.dev0) (3.0.9)\n", + "Collecting greenlet!=0.4.17\n", + " Downloading greenlet-2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (535 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m535.9/535.9 kB\u001b[0m \u001b[31m47.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting cmd2>=1.0.0\n", + " Downloading cmd2-2.4.2-py3-none-any.whl (147 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.1/147.1 kB\u001b[0m \u001b[31m20.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting autopage>=0.4.0\n", + " Downloading autopage-0.5.1-py3-none-any.whl (29 kB)\n", + "Collecting PrettyTable>=0.7.2\n", + " Downloading prettytable-3.5.0-py3-none-any.whl (26 kB)\n", + "Collecting stevedore>=2.0.1\n", + " Downloading stevedore-4.1.1-py3-none-any.whl (50 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.0/50.0 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from aiohttp<4.0.0->kopf>=1.35.3->dask_kubernetes) (1.8.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from aiohttp<4.0.0->kopf>=1.35.3->dask_kubernetes) (1.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from aiohttp<4.0.0->kopf>=1.35.3->dask_kubernetes) (1.3.1)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from 
aiohttp<4.0.0->kopf>=1.35.3->dask_kubernetes) (4.0.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from aiohttp<4.0.0->kopf>=1.35.3->dask_kubernetes) (22.1.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from aiohttp<4.0.0->kopf>=1.35.3->dask_kubernetes) (6.0.2)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from aiohttp<4.0.0->kopf>=1.35.3->dask_kubernetes) (2.1.1)\n", + "Collecting pyperclip>=1.6\n", + " Downloading pyperclip-1.8.2.tar.gz (20 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: wcwidth>=0.1.7 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from cmd2>=1.0.0->cliff->optuna==3.1.0.dev0) (0.2.5)\n", + "Collecting pyasn1-modules>=0.2.1\n", + " Downloading pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m155.3/155.3 kB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.1->dask_kubernetes) (5.2.0)\n", + "Collecting rsa<5,>=3.1.4\n", + " Downloading rsa-4.9-py3-none-any.whl (34 kB)\n", + "Collecting pbr!=2.1.0,>=2.0.0\n", + " Downloading pbr-5.11.0-py2.py3-none-any.whl (112 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.6/112.6 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: heapdict in /opt/conda/envs/rapids/lib/python3.9/site-packages (from zict>=0.1.3->distributed>=2022.08.1->dask_kubernetes) (1.0.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in 
/opt/conda/envs/rapids/lib/python3.9/site-packages (from jinja2->distributed>=2022.08.1->dask_kubernetes) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/rapids/lib/python3.9/site-packages (from requests->kubernetes>=12.0.1->dask_kubernetes) (3.4)\n", + "Collecting oauthlib>=3.0.0\n", + " Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.7/151.7 kB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pyasn1<0.5.0,>=0.4.6\n", + " Downloading pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.1/77.1 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hBuilding wheels for collected packages: optuna, pyperclip\n", + " Building wheel for optuna (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for optuna: filename=optuna-3.1.0.dev0-py3-none-any.whl size=363646 sha256=660c6da6ae9e5b7e43406a89e75e9a4b718702e262326001ada3b081471aa989\n", + " Stored in directory: /root/.cache/pip/wheels/c7/d3/1e/ded6238203a93b2b7e5da2aa2f2d97f5980e7eca6ff9d0d6c0\n", + " Building wheel for pyperclip (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for pyperclip: filename=pyperclip-1.8.2-py3-none-any.whl size=11137 sha256=5cb0314e8c7018bc3ff6568cdd97a4a7c759b8c831d33be9d6c84f7afcc6e9bd\n", + " Stored in directory: /root/.cache/pip/wheels/23/12/dd/067d543d895c9036459c68b02a9dcd66c7aad675ef90cbd3ab\n", + "Successfully built optuna pyperclip\n", + "Installing collected packages: pyperclip, pyasn1, scipy, rsa, python-json-logger, pyasn1-modules, PrettyTable, pbr, oauthlib, Mako, iso8601, importlib-metadata, greenlet, colorlog, cmd2, cmaes, autopage, stevedore, sqlalchemy, requests-oauthlib, google-auth, kubernetes-asyncio, kubernetes, kopf, cliff, alembic, optuna, dask_kubernetes\n", + " Attempting uninstall: scipy\n", + " Found existing installation: scipy 1.6.0\n", + " Uninstalling scipy-1.6.0:\n", + " Successfully uninstalled scipy-1.6.0\n", + " Attempting uninstall: importlib-metadata\n", + " Found existing installation: importlib-metadata 5.0.0\n", + " Uninstalling importlib-metadata-5.0.0:\n", + " Successfully uninstalled importlib-metadata-5.0.0\n", + "Successfully installed Mako-1.2.4 PrettyTable-3.5.0 alembic-1.8.1 autopage-0.5.1 cliff-4.0.0 cmaes-0.9.0 cmd2-2.4.2 colorlog-6.7.0 dask_kubernetes-2022.10.1 google-auth-2.14.1 greenlet-2.0.1 importlib-metadata-4.13.0 iso8601-1.1.0 kopf-1.36.0 kubernetes-25.3.0 kubernetes-asyncio-24.2.2 oauthlib-3.2.2 optuna-3.1.0.dev0 pbr-5.11.0 pyasn1-0.4.8 pyasn1-modules-0.2.8 pyperclip-1.8.2 python-json-logger-2.0.4 requests-oauthlib-1.3.1 rsa-4.9 scipy-1.9.3 sqlalchemy-1.4.44 stevedore-4.1.1\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install dask_kubernetes git+https://github.com/optuna/optuna.git@bc6c05dc655aab7e7a02e91e7306609f2a4524ec" + ] + }, + { + "cell_type": "markdown", + "id": "62a35c4d-07f6-4e42-b865-dbd2e54ed369", + "metadata": {}, + "source": [ + "# Set up Dask cluster" + ] + }, + { + "cell_type": "markdown", + "id": "1ae6a7d9-b943-43e4-b1ed-c6110412ce3f", + "metadata": {}, + "source": [ + "Let us set up a Dask cluster using the `KubeCluster` class. Fill in the following variables, depending on the configuration of your Kubernetes cluster. Here is how you can get `n_workers`, assuming that you are using all the nodes in the Kubernetes cluster. Let `N` be the number of nodes.\n", + "* On AWS Elastic Kubernetes Service (EKS): `n_workers = N - 2`\n", + "* On Google Cloud Kubernetes: `n_workers = N - 1`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d88f63a9-3848-4804-8a1a-3cee5dcbe756", + "metadata": {}, + "outputs": [], + "source": [ + "# Choose the same RAPIDS image you used for launching the notebook session\n", + "rapids_image = \"rapidsai/rapidsai-core:22.10-cuda11.5-runtime-ubuntu20.04-py3.9\"\n", + "# Use the number of worker nodes in your Kubernetes cluster.\n", + "n_workers = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "68ad26b9-3280-4087-b945-1cb7f9fb0563", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Unclosed client session\n", + "client_session: \n" + ] + } + ], + "source": [ + "from dask_kubernetes.operator import KubeCluster\n", + "\n", + "cluster = KubeCluster(name=\"rapids-dask\",\n", + " image=rapids_image,\n", + " worker_command=\"dask-cuda-worker\",\n", + " n_workers=n_workers,\n", + " resources={\"limits\": {\"nvidia.com/gpu\": \"1\"}},\n", + " env={\"DISABLE_JUPYTER\": \"true\",\n", + " 
\"EXTRA_PIP_PACKAGES\":\n", + " \"git+https://github.com/optuna/optuna.git@bc6c05dc655aab7e7a02e91e7306609f2a4524ec\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cb4ff1fc-fea1-495e-97cb-24e7c3957eb7", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b686fd59dcb44f2497f48bf0aab60585", + "version_major": 2, + "version_minor": 0 + }, + "text/html": [ + "
\n", + "
\n", + "
\n", + "
\n", + "

KubeCluster

\n", + "

rapids-dask

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Dashboard: /notebook/kubeflow-user-example-com/rapids/proxy/rapids-dask-scheduler.kubeflow-user-example-com:8787/status\n", + " \n", + " Workers: 0\n", + "
\n", + " Total threads: 0\n", + " \n", + " Total memory: 0 B\n", + "
\n", + "\n", + "
\n", + " \n", + "

Scheduler Info

\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

Scheduler

\n", + "

Scheduler-7bc3d0a6-9c3b-493f-b1a4-b7664c136062

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Comm: tcp://10.36.0.20:8786\n", + " \n", + " Workers: 0\n", + "
\n", + " Dashboard: /notebook/kubeflow-user-example-com/rapids/proxy/10.36.0.20:8787/status\n", + " \n", + " Total threads: 0\n", + "
\n", + " Started: Just now\n", + " \n", + " Total memory: 0 B\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "

Workers

\n", + "
\n", + "\n", + " \n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
" + ], + "text/plain": [ + "KubeCluster(rapids-dask, 'tcp://rapids-dask-scheduler.kubeflow-user-example-com:8786', workers=0, threads=0, memory=0 B)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "634fe9f5-e28d-4c24-9c55-19cbd15038c3", + "metadata": {}, + "outputs": [], + "source": [ + "from dask.distributed import Client\n", + "\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "id": "e118a6b2-771b-4268-9f77-e753062200c2", + "metadata": {}, + "source": [ + "# Perform hyperparameter optimization with a toy example" + ] + }, + { + "cell_type": "markdown", + "id": "47f6b3dc-9e09-4fc5-b70d-f9882cf2795c", + "metadata": {}, + "source": [ + "Now we can run hyperparameter optimization. The workers will run multiple training jobs in parallel." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2e1f3a02-53f4-47d1-86bf-8a8a729e4364", + "metadata": {}, + "outputs": [], + "source": [ + "def objective(trial):\n", + " x = trial.suggest_uniform(\"x\", -10, 10)\n", + " return (x - 2) ** 2" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2d45b6de-d092-457b-b8f3-dd202f04484f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_75/1194069379.py:9: ExperimentalWarning: DaskStorage is experimental (supported from v3.1.0). 
The interface can change in the future.\n", + " dask_storage = optuna.integration.DaskStorage(storage=backend_storage, client=client)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing hyperparameter combinations 0..16\n", + "Testing hyperparameter combinations 16..32\n", + "Testing hyperparameter combinations 32..48\n", + "Testing hyperparameter combinations 48..64\n", + "Testing hyperparameter combinations 64..80\n", + "Testing hyperparameter combinations 80..96\n", + "Testing hyperparameter combinations 96..100\n" + ] + } + ], + "source": [ + "import optuna\n", + "from dask.distributed import wait\n", + "\n", + "# Number of hyperparameter combinations to try in parallel\n", + "n_trials = 100\n", + "\n", + "# Optimize in parallel on your Dask cluster\n", + "backend_storage = optuna.storages.InMemoryStorage()\n", + "dask_storage = optuna.integration.DaskStorage(storage=backend_storage, client=client)\n", + "study = optuna.create_study(direction=\"minimize\", storage=dask_storage)\n", + "\n", + "futures = []\n", + "for i in range(0, n_trials, n_workers * 4):\n", + " iter_range = (i, min([i + n_workers * 4, n_trials]))\n", + " futures.append(\n", + " {\n", + " \"range\": iter_range,\n", + " \"futures\": [\n", + " client.submit(study.optimize, objective, n_trials=1, pure=False)\n", + " for _ in range(*iter_range)\n", + " ]\n", + " }\n", + " )\n", + "for partition in futures:\n", + " iter_range = partition[\"range\"]\n", + " print(f\"Testing hyperparameter combinations {iter_range[0]}..{iter_range[1]}\")\n", + " _ = wait(partition[\"futures\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4a138366-4535-4d3f-885e-6066626588aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'x': 1.9899853370223668}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "study.best_params" + ] + }, + { + "cell_type": "code", + "execution_count": 9, 
+ "id": "3f9437f8-70ef-4649-9068-8f14ef491141", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.00010029347455557715" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "study.best_value" + ] + }, + { + "cell_type": "markdown", + "id": "ef6bdc73-10fd-4221-abe1-1e04854335e4", + "metadata": {}, + "source": [ + "# Perform hyperparameter optimization with XGBoost GPU algorithm" + ] + }, + { + "cell_type": "markdown", + "id": "bc1ade8a-218f-45ab-bedb-abc4bce5fffc", + "metadata": {}, + "source": [ + "Now let's try optimizing hyperparameters for an XGBoost model." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "43b5d290-7858-4bd5-8d46-81ce4f1b298b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_breast_cancer\n", + "from sklearn.model_selection import cross_val_score, KFold\n", + "import xgboost as xgb\n", + "from optuna.samplers import RandomSampler\n", + "\n", + "def objective(trial):\n", + " X, y = load_breast_cancer(return_X_y=True)\n", + " params = {\n", + " \"n_estimators\": 10,\n", + " \"verbosity\": 0,\n", + " \"tree_method\": \"gpu_hist\",\n", + " # L2 regularization weight.\n", + " \"lambda\": trial.suggest_float(\"lambda\", 1e-8, 100.0, log=True),\n", + " # L1 regularization weight.\n", + " \"alpha\": trial.suggest_float(\"alpha\", 1e-8, 100.0, log=True),\n", + " # sampling according to each tree.\n", + " \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.2, 1.0),\n", + " \"max_depth\": trial.suggest_int(\"max_depth\", 2, 10, step=1),\n", + " # minimum child weight, larger the term more conservative the tree.\n", + " \"min_child_weight\": trial.suggest_float(\"min_child_weight\", 1e-8, 100, log=True),\n", + " \"learning_rate\": trial.suggest_float(\"learning_rate\", 1e-8, 1.0, log=True),\n", + " # defines how selective algorithm is.\n", + " \"gamma\": trial.suggest_float(\"gamma\", 1e-8, 1.0, 
log=True),\n", + " \"grow_policy\": \"depthwise\",\n", + " \"eval_metric\": \"logloss\"\n", + " }\n", + " clf = xgb.XGBClassifier(**params)\n", + " fold = KFold(n_splits=5, shuffle=True, random_state=0)\n", + " score = cross_val_score(clf, X, y, cv=fold, scoring='neg_log_loss')\n", + " return score.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "56e9d765-0484-46d7-8728-f1aa25dc5f33", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_75/1634478960.py:6: ExperimentalWarning: DaskStorage is experimental (supported from v3.1.0). The interface can change in the future.\n", + " dask_storage = optuna.integration.DaskStorage(storage=backend_storage, client=client)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing hyperparameter combinations 0..16\n", + "Testing hyperparameter combinations 16..32\n", + "Testing hyperparameter combinations 32..48\n", + "Testing hyperparameter combinations 48..64\n", + "Testing hyperparameter combinations 64..80\n", + "Testing hyperparameter combinations 80..96\n", + "Testing hyperparameter combinations 96..112\n", + "Testing hyperparameter combinations 112..128\n", + "Testing hyperparameter combinations 128..144\n", + "Testing hyperparameter combinations 144..160\n", + "Testing hyperparameter combinations 160..176\n", + "Testing hyperparameter combinations 176..192\n", + "Testing hyperparameter combinations 192..208\n", + "Testing hyperparameter combinations 208..224\n", + "Testing hyperparameter combinations 224..240\n", + "Testing hyperparameter combinations 240..250\n" + ] + } + ], + "source": [ + "# Number of hyperparameter combinations to try in parallel\n", + "n_trials = 250\n", + "\n", + "# Optimize in parallel on your Dask cluster\n", + "backend_storage = optuna.storages.InMemoryStorage()\n", + "dask_storage = optuna.integration.DaskStorage(storage=backend_storage, 
client=client)\n", + "study = optuna.create_study(direction=\"maximize\",\n", + " sampler=RandomSampler(seed=0),\n", + " storage=dask_storage)\n", + "futures = []\n", + "for i in range(0, n_trials, n_workers * 4):\n", + " iter_range = (i, min([i + n_workers * 4, n_trials]))\n", + " futures.append(\n", + " {\n", + " \"range\": iter_range,\n", + " \"futures\": [\n", + " client.submit(study.optimize, objective, n_trials=1, pure=False)\n", + " for _ in range(*iter_range)\n", + " ]\n", + " }\n", + " )\n", + "for partition in futures:\n", + " iter_range = partition[\"range\"]\n", + " print(f\"Testing hyperparameter combinations {iter_range[0]}..{iter_range[1]}\")\n", + " _ = wait(partition[\"futures\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "72630515-f7ad-47ce-b57c-b5c6fdd55faf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'lambda': 1.9471539598103378,\n", + " 'alpha': 1.1141784696858766e-08,\n", + " 'colsample_bytree': 0.7422532294369841,\n", + " 'max_depth': 4,\n", + " 'min_child_weight': 0.2248745054413427,\n", + " 'learning_rate': 0.4983200494234886,\n", + " 'gamma': 9.77293810275356e-07}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "study.best_params" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6abba0f9-0ca1-41d9-ad8d-eda1a31a26f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-0.10351124143719839" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "study.best_value" + ] + }, + { + "cell_type": "markdown", + "id": "889fdbc4-dd40-49a9-a624-39541475db53", + "metadata": {}, + "source": [ + "Let's visualize the progress made by hyperparameter optimization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e4951b29-a10d-4a7c-afae-e054a78f89b6", + "metadata": {}, + "outputs": [], + "source": [ + "from optuna.visualization.matplotlib import plot_optimization_history, plot_param_importances" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "18f37fa0-a8a6-4211-bfc4-2902661c6651", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_75/3324289224.py:1: ExperimentalWarning: plot_optimization_history is experimental (supported from v2.2.0). The interface can change in the future.\n", + " plot_optimization_history(study)\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_optimization_history(study)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "73a283e5-68c4-41e2-9abe-05af673c3514", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_75/3836449081.py:1: ExperimentalWarning: plot_param_importances is experimental (supported from v2.2.0). The interface can change in the future.\n", + " plot_param_importances(study)\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_param_importances(study)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e341aaa-708c-40dc-ae8e-e0ba96e3848c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]" + }, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/source/index.md b/source/index.md index 1ca268cd..144943be 100644 --- a/source/index.md +++ b/source/index.md @@ -73,10 +73,10 @@ There are many tools to deploy RAPIDS. ```` ````{grid-item-card} -:link: https://github.com/rapidsai/cloud-ml-examples -{fas}`book;sd-text-primary` Cloud ML Examples +:link: examples +{fas}`book;sd-text-primary` Workflow examples ^^^ -See our [example notebooks repo](https://github.com/rapidsai/cloud-ml-examples) with opinionated deployments of RAPIDS to boost machine learning workflows. +For inspiration see our example notebooks with opinionated deployments of RAPIDS to boost machine learning workflows. {bdg-primary}`xgboost` {bdg-primary}`optuna` @@ -116,3 +116,11 @@ guides/index tools/index ``` + +```{toctree} +:maxdepth: 3 +:hidden: +:caption: Examples + +examples/index +```