From ef86ec6e446f49a15f96d8f6d7ff0ed48032b406 Mon Sep 17 00:00:00 2001 From: dpadmanabhan Date: Mon, 7 Oct 2024 17:57:29 -0700 Subject: [PATCH] Adding hpo example using optuna for hnsw_flat and cagra --- notebooks/cuvs_hpo_example.ipynb | 14 - notebooks/faiss_and_cuvs_hpo_example.ipynb | 470 +++++++++++++++++++++ 2 files changed, 470 insertions(+), 14 deletions(-) create mode 100644 notebooks/faiss_and_cuvs_hpo_example.ipynb diff --git a/notebooks/cuvs_hpo_example.ipynb b/notebooks/cuvs_hpo_example.ipynb index d8b11a82c..19846b1a5 100644 --- a/notebooks/cuvs_hpo_example.ipynb +++ b/notebooks/cuvs_hpo_example.ipynb @@ -11,20 +11,6 @@ "Note: This notebook has been tested on Sagemaker Studio with an instance type of ml.g5.12xlarge." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d35868b-93ad-43b4-ae15-59e75aa89e3c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#Install Required Packages\n", - "%mamba install -c conda-forge -c nvidia -c rapidsai-nightly cuvs optuna -y\n", - "%pip install cupy" - ] - }, { "cell_type": "code", "execution_count": 35, diff --git a/notebooks/faiss_and_cuvs_hpo_example.ipynb b/notebooks/faiss_and_cuvs_hpo_example.ipynb new file mode 100644 index 000000000..6fc6f0d7c --- /dev/null +++ b/notebooks/faiss_and_cuvs_hpo_example.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ba87966d-4066-42b1-a9a0-7ae894bfb602", + "metadata": {}, + "source": [ + "#### Prerequisites:\n", + "Follow these steps to use RAPIDS 24.10 for running this Jupyter notebook in the SageMaker Studio environment.\n", + "\n", + "##### 1: Create a Conda Environment: Use the following command to create a new conda environment in the terminal named rapids-24.10. 
import tarfile
import urllib.request


def extract_tar_dataset(dataset_url, tarfilename, work_dir):
    """Download a tar archive if it is missing and extract it under ``work_dir``.

    Parameters:
    dataset_url (str): URL the archive is fetched from when it is not already on disk.
    tarfilename (str): File name of the archive inside ``work_dir`` (e.g. ``wiki_all_1M.tar``).
    work_dir (str): Directory that holds the archive and the extracted folder.

    Returns:
    str: Path of the extraction folder, named after the part of ``tarfilename``
    before its first ``.``. The path has no trailing separator.
    """
    # wiki-all datasets are in tar format
    tar_path = os.path.join(work_dir, tarfilename)
    folder_name = tarfilename.split(".")[0]
    extract_path = os.path.join(work_dir, folder_name)

    if os.path.exists(tar_path):
        print("tar file is already downloaded")
    else:
        # Download to a temporary name first so that an interrupted transfer is
        # never mistaken for a complete archive on the next run.
        partial_path = tar_path + ".part"
        try:
            urllib.request.urlretrieve(dataset_url, partial_path)
            os.replace(partial_path, tar_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Nothing to do when a previous run already produced the folder.
    if os.path.isdir(extract_path):
        print("Files already extracted")
        return extract_path

    with tarfile.open(tar_path, "r") as tar:
        if hasattr(tarfile, "data_filter"):
            # Refuse absolute paths, "..", and links escaping the destination.
            tar.extractall(extract_path, filter="data")
        else:
            # Interpreter without extraction filters: fall back to the plain call.
            tar.extractall(extract_path)
    return extract_path


def read_data(file_path, dtype, use_cupy):
    """Read a matrix stored as an int32 ``(rows, cols)`` header followed by raw values.

    Parameters:
    file_path (str): Path of the binary file (e.g. ``*.fbin`` / ``*.ibin``).
    dtype: NumPy dtype of the values that follow the header.
    use_cupy (bool): When True the result is converted with ``cp.asarray``,
        otherwise with ``np.asarray``.

    Returns:
    A ``(rows, cols)`` array of ``dtype`` from the selected array library.
    """
    np_lib = cp if use_cupy else np
    with open(file_path, "rb") as f:
        # Convert the header to Python ints: multiplying two np.int32 scalars
        # wraps around once rows * cols exceeds 2**31 - 1.
        rows, cols = (int(v) for v in np.fromfile(f, count=2, dtype=np.int32))
        d = np.fromfile(f, count=rows * cols, dtype=dtype).reshape(rows, cols)
    return np_lib.asarray(d)
def multi_objective_hnsw_flat(trial):
    """
    Optuna objective for a FAISS HNSW flat index: builds the index, searches it, and reports
    build time, per-query search latency, and recall for one choice of 'ef_construction_val'
    and 'ef_search_val'.

    The notebook-level variables `dim`, `vectors`, `queries` and `gt_neighbors` are read
    directly, so they must hold the NumPy (CPU) data when this objective runs.

    Parameters:
    trial (optuna.trial.Trial): A trial object that suggests values for hyperparameters.

    Returns:
    tuple: A tuple containing build time in seconds, search latency in milliseconds
    (total search time divided by the number of queries), and recall value, each rounded
    to 4 decimal places.

    Raises:
    optuna.TrialPruned: When the measured recall is below 0.80.
    """
    ef_construction_val = trial.suggest_categorical('ef_construction_val', [32, 64, 128, 256])
    # efSearch: depth of exploration of the graph at search time
    ef_search_val = trial.suggest_categorical('ef_search_val', [16, 32, 64, 128])

    # perf_counter is a monotonic, high-resolution clock; unlike time.time it is
    # not affected by system clock adjustments while a build is running.
    start_build_time = time.perf_counter()
    index = faiss.IndexHNSWFlat(dim, M=32)
    # set efConstruction and efSearch parameters
    index.hnsw.efConstruction = ef_construction_val
    index.hnsw.efSearch = ef_search_val
    # add data to index (the build time covers construction plus insertion)
    index.add(vectors)
    build_time_in_secs = time.perf_counter() - start_build_time

    # Perform the search
    start_search_time = time.perf_counter()
    distances, indices = index.search(queries, k=10)
    search_time = time.perf_counter() - start_search_time

    # average latency per query, in milliseconds
    latency_in_ms = (search_time * 1000) / queries.shape[0]

    recall = calc_recall(indices, gt_neighbors, use_cupy=False)
    # Configurations below the recall floor are discarded rather than reported.
    if recall < 0.80:
        raise optuna.TrialPruned()
    return round(build_time_in_secs, 4), round(latency_in_ms, 4), round(recall, 4)
"code", + "execution_count": 26, + "id": "c2c0874c-2b61-4aad-9549-c8856cb2587f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2024-10-07 21:01:22,471] A new study created in memory with name: no-name-8e8e21c9-de27-4577-9e28-9eabc07979d5\n", + "[I 2024-10-07 21:02:12,136] Trial 0 finished with values: [47.7321, 0.1486, 0.9752] and parameters: {'ef_construction_val': 32, 'ef_search_val': 128}. \n", + "[I 2024-10-07 21:03:04,159] Trial 1 finished with values: [51.2019, 0.0375, 0.8533] and parameters: {'ef_construction_val': 64, 'ef_search_val': 32}. \n", + "[I 2024-10-07 21:04:47,146] Trial 2 pruned. \n", + "[I 2024-10-07 21:05:35,595] Trial 3 finished with values: [47.4594, 0.0551, 0.8967] and parameters: {'ef_construction_val': 32, 'ef_search_val': 32}. \n", + "[I 2024-10-07 21:08:56,474] Trial 4 finished with values: [199.6245, 0.0816, 0.9597] and parameters: {'ef_construction_val': 256, 'ef_search_val': 64}. \n", + "[I 2024-10-07 21:09:48,753] Trial 5 finished with values: [51.2019, 0.0636, 0.9165] and parameters: {'ef_construction_val': 64, 'ef_search_val': 64}. \n", + "[I 2024-10-07 21:10:40,211] Trial 6 pruned. \n", + "[I 2024-10-07 21:11:29,403] Trial 7 finished with values: [47.8772, 0.0874, 0.948] and parameters: {'ef_construction_val': 32, 'ef_search_val': 64}. \n", + "[I 2024-10-07 21:12:19,367] Trial 8 finished with values: [48.6612, 0.0864, 0.9479] and parameters: {'ef_construction_val': 32, 'ef_search_val': 64}. \n", + "[I 2024-10-07 21:13:11,123] Trial 9 pruned. 
def summarize_best_trials(
    trials,
    metric_indices=(0, 1, 2),
    metric_labels=("lowest build time in secs", "lowest latency in ms", "highest recall"),
    minimize_indices=(0, 1),
):
    """
    Summarizes the best trials from a list of trials based on specified metrics.

    Parameters:
    trials (iterable): Trial objects, where each trial has attributes 'number', 'params', and 'values'.
    metric_indices (sequence): Indices into each trial's 'values' to report. Default is (0, 1, 2).
    metric_labels (sequence): Labels describing each metric, paired positionally with
        metric_indices. Default is ("lowest build time in secs", "lowest latency in ms", "highest recall").
    minimize_indices (sequence): Metric indices for which lower values are better; every other
        index is treated as higher-is-better. Default is (0, 1).

    Functionality:
    - Prints a short notice and returns if there are no trials (e.g. every trial was pruned).
    - For each (index, label) pair, picks the trial with the smallest value when the index is in
      minimize_indices and the largest value otherwise; ties go to the earliest trial.
    - Prints the trial number, parameters, and values of each selected trial.
    """
    # Materialize once: the input may be a one-shot iterable and is scanned per metric.
    trials = list(trials)
    if not trials:
        # min()/max() raise ValueError on an empty sequence.
        print("No trials to summarize.")
        return

    for index, label in zip(metric_indices, metric_labels):
        pick_best = min if index in minimize_indices else max
        best_trial = pick_best(trials, key=lambda t: t.values[index])
        print(f"Trial with {label}:")
        print(f"\tnumber: {best_trial.number}")
        print(f"\tparams: {best_trial.params}")
        print(f"\tvalues: {best_trial.values}")
def multi_objective_cagra(trial):
    """
    Optuna objective for a cuVS CAGRA index: builds the index, searches it, and reports build
    time, per-query search latency, and recall for one choice of 'intermediate_graph_degree',
    'graph_degree' and 'itopk_size'.

    The notebook-level variables `vectors`, `queries` and `gt_neighbors` are read directly, so
    they must hold the CuPy (GPU) data when this objective runs.

    Parameters:
    trial (optuna.trial.Trial): A trial object that suggests values for hyperparameters.

    Returns:
    tuple: A tuple containing build time in seconds, search latency in milliseconds
    (total search time divided by the number of queries), and recall value, each rounded
    to 4 decimal places.

    Raises:
    optuna.TrialPruned: When the measured recall is below 0.80.
    """
    # Suggest values for build parameters
    intermediate_graph_degree = trial.suggest_categorical('intermediate_graph_degree', [64, 128, 256])
    graph_degree = trial.suggest_categorical('graph_degree', [32, 64])

    # Suggest a value for the search parameter itopk_size
    itopk_size = trial.suggest_categorical('itopk_size', [16, 32, 64, 128])

    build_params = cagra.IndexParams(
        intermediate_graph_degree=intermediate_graph_degree,
        graph_degree=graph_degree,
        build_algo="nn_descent"
    )

    # perf_counter is a monotonic, high-resolution clock; unlike time.time it is
    # not affected by system clock adjustments while a build is running.
    start_build_time = time.perf_counter()
    cagra_index = cagra.build(build_params, vectors)
    build_time_in_secs = time.perf_counter() - start_build_time

    # Configure search parameters
    search_params = cagra.SearchParams(itopk_size=itopk_size)

    # Perform the search (top-10 neighbors per query; no refinement step is applied)
    start_search_time = time.perf_counter()
    distances, indices = cagra.search(search_params, cagra_index, queries, k=10)
    search_time = time.perf_counter() - start_search_time

    # average latency per query, in milliseconds
    latency_in_ms = (search_time * 1000) / queries.shape[0]

    recall = calc_recall(indices, gt_neighbors, use_cupy=True)

    # Configurations below the recall floor are discarded rather than reported.
    if recall < 0.80:
        raise optuna.TrialPruned()

    return round(build_time_in_secs, 4), round(latency_in_ms, 4), round(recall, 4)
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2024-10-07 20:46:55,995] A new study created in memory with name: no-name-ef9a66c8-266c-47b0-85aa-09edfe875695\n", + "[I 2024-10-07 20:47:17,224] Trial 0 finished with values: [15.1621, 0.0525, 0.9925] and parameters: {'intermediate_graph_degree': 64, 'graph_degree': 64, 'itopk_size': 128}. \n", + "[I 2024-10-07 20:47:37,912] Trial 1 finished with values: [15.1199, 0.0121, 0.9483] and parameters: {'intermediate_graph_degree': 64, 'graph_degree': 64, 'itopk_size': 32}. \n", + "[I 2024-10-07 20:48:00,553] Trial 2 finished with values: [17.083, 0.0061, 0.8127] and parameters: {'intermediate_graph_degree': 128, 'graph_degree': 32, 'itopk_size': 32}. \n", + "[I 2024-10-07 20:48:37,587] Trial 3 pruned. \n", + "[I 2024-10-07 20:49:00,817] Trial 4 finished with values: [17.5252, 0.0235, 0.9778] and parameters: {'intermediate_graph_degree': 128, 'graph_degree': 64, 'itopk_size': 64}. \n", + "[I 2024-10-07 20:49:37,553] Trial 5 pruned. \n", + "[I 2024-10-07 20:49:58,935] Trial 6 finished with values: [15.1299, 0.041, 0.9927] and parameters: {'intermediate_graph_degree': 64, 'graph_degree': 64, 'itopk_size': 128}. \n", + "[I 2024-10-07 20:50:19,638] Trial 7 finished with values: [15.1149, 0.0059, 0.8532] and parameters: {'intermediate_graph_degree': 64, 'graph_degree': 64, 'itopk_size': 16}. \n", + "[I 2024-10-07 20:50:42,297] Trial 8 finished with values: [17.0821, 0.0061, 0.8139] and parameters: {'intermediate_graph_degree': 128, 'graph_degree': 32, 'itopk_size': 32}. \n", + "[I 2024-10-07 20:51:02,956] Trial 9 finished with values: [15.1185, 0.006, 0.852] and parameters: {'intermediate_graph_degree': 64, 'graph_degree': 64, 'itopk_size': 16}. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 19min 17s, sys: 1min 45s, total: 21min 3s\n", + "Wall time: 4min 6s\n" + ] + } + ], + "source": [ + "%%time \n", + "cagra_study = optuna.create_study(directions=['minimize', 'minimize', 'maximize'])\n", + "cagra_study.optimize(multi_objective_cagra, n_trials=10)" + ] + }, + { + "cell_type": "markdown", + "id": "571b829f-1106-4a25-b39a-223d4838e1fc", + "metadata": {}, + "source": [ + "#### It took about 4 mins to optimize cagra using optuna with 10 trials" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "9f2ce6f9-7021-4e9b-833b-8314ace4c313", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trial with lowest build time in secs:\n", + "\tnumber: 7\n", + "\tparams: {'intermediate_graph_degree': 64, 'graph_degree': 64, 'itopk_size': 16}\n", + "\tvalues: [15.1149, 0.0059, 0.8532]\n", + "Trial with lowest latency in ms:\n", + "\tnumber: 7\n", + "\tparams: {'intermediate_graph_degree': 64, 'graph_degree': 64, 'itopk_size': 16}\n", + "\tvalues: [15.1149, 0.0059, 0.8532]\n", + "Trial with highest recall:\n", + "\tnumber: 6\n", + "\tparams: {'intermediate_graph_degree': 64, 'graph_degree': 64, 'itopk_size': 128}\n", + "\tvalues: [15.1299, 0.041, 0.9927]\n" + ] + } + ], + "source": [ + "summarize_best_trials(cagra_study.best_trials)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (cuvs-rapids-24.10)", + "language": "python", + "name": "cuvs-rapids-24.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}