From 07da2504586ffa35dfa0de826688e060053ed257 Mon Sep 17 00:00:00 2001 From: Luis Lopez Date: Fri, 7 Feb 2025 12:12:20 -0500 Subject: [PATCH] feat: add notebook for BQML remote endpoint Blog (#1704) --------- Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> Co-authored-by: Holt Skinner --- .github/actions/spelling/allow.txt | 2 + open-models/README.md | 1 + .../bigquery_ml_llama_inference.ipynb | 922 ++++++++++++++++++ 3 files changed, 925 insertions(+) create mode 100644 open-models/use-cases/bigquery_ml_llama_inference.ipynb diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index 30d3f84f78d..4c7e1a3fd72 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -829,6 +829,7 @@ loghub logparser logprobs lolcat +loras lparam lru lsb @@ -927,6 +928,7 @@ openai openfda opsz osm +oss osx outdir outro diff --git a/open-models/README.md b/open-models/README.md index 86dc1bd0614..ae4e03f2e29 100644 --- a/open-models/README.md +++ b/open-models/README.md @@ -21,5 +21,6 @@ This repository contains examples for deploying and fine-tuning open source mode ### Use cases +- [use-cases/bigquery_ml_llama_inference.ipynb](./use-cases/bigquery_ml_llama_inference.ipynb) - This notebook showcases a simple end-to-end process for extracting entities and performing data analytics using BigQuery in conjunction with an open-source text-generation Large Language Model (LLM). We use Meta's Llama 3.3 70B model as an example. - [use-cases/cloud_run_ollama_gemma2_rag_qa.ipynb](./use-cases/cloud_run_ollama_gemma2_rag_qa.ipynb) - This notebooks provides steps and code to deploy an open source RAG pipeline to Cloud Run using Ollama and the Gemma 2 model. - [use-cases/guess_app.ipynb](./use-cases/guess_app.ipynb) - This notebook shows how to build a "Guess Who or What" app using FLUX and Gemini. diff --git a/open-models/use-cases/bigquery_ml_llama_inference.ipynb b/open-models/use-cases/bigquery_ml_llama_inference.ipynb new file mode 100644 index 00000000000..800f6b4f6f0 --- /dev/null +++ b/open-models/use-cases/bigquery_ml_llama_inference.ipynb @@ -0,0 +1,922 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Use Any OSS Gen AI Model Against Your BigQuery Data\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"BigQuery
Open in BigQuery Studio\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Author(s) | [Jasper Xu](https://github.com/ZehaoXU), [Luis Lopez](https://github.com/luseloso) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This notebook showcases a simple end-to-end process for extracting entities and performing data analytics using BigQuery in conjunction with an open-source text-generation Large Language Model (LLM). We use Meta's Llama 3.3 70B model as an example, and the process involves:\n", + "\n", + "* Deploy the Llama 3.3 70B model on Vertex AI.\n", + "* Configure the necessary setup, including downloading sample data and enabling BigQuery access to Vertex AI.\n", + "* Create a [remote model](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-remote-model-open) in BigQuery against the Llama 3.3 70B\n", + "* Employ the [ML.GENERATE_TEXT](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-text) function to extract structured information from medical transcripts.\n", + "* Analyzing the resulting structured data to gain insights.\n", + "\n", + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* Vertex AI\n", + "* Cloud Storage\n", + "* BigQuery\n", + "\n", + "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "## Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Deploy the Llama 3.3 70B Model on Vertex AI" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "#### Request H100 quota\n", + "\n", + "By default, the quota for H100 deployment `Custom model serving per region` is 0. If you prefer to use H100 for higher throughput (recommended), you need to request for H100 quota following the instructions at [\"Request a higher quota\"](https://cloud.google.com/docs/quota/view-manage#requesting_higher_quota). We recommend starting with a small quota request, for example setting it to 8.\n", + "\n", + "If you want to run predictions with A100 80GB or H100 GPUs, we recommend using the regions listed below. 
\n", + "\n", + "| Machine Type | Accelerator Type | Recommended Regions |\n", + "| --- | --- | --- |\n", + "| a2-ultragpu-1g | 1 NVIDIA_A100_80GB | us-central1, us-east4, europe-west4, asia-southeast1, us-east4 |\n", + "| a3-highgpu-2g | 2 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |\n", + "| a3-highgpu-4g | 4 NVIDIA_H100_80GB | us-west1, asia-southeast1, europe-west4 |\n", + "| a3-highgpu-8g | 8 NVIDIA_H100_80GB | us-central1, us-east5, europe-west4, us-west1, asia-southeast1 |\n", + "\n", + "**NOTE:** Make sure you have associated quota in selected regions. Click the links to see your current quota for each GPU type: [Nvidia A100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_a100_80gb_gpus), [Nvidia H100 80GB](https://console.cloud.google.com/iam-admin/quotas?metric=aiplatform.googleapis.com%2Fcustom_model_serving_nvidia_h100_gpus).\n", + "\n", + "Alternatively, you can choose to use Nvidia L4 GPUs for this deployment.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ABhBcw0bloIu" + }, + "source": [ + "#### Setup Google Cloud project\n", + "\n", + "1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "\n", + "2. Point the code to your Project ID. \n", + "\n", + "\n", + "3. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. \"us\") is not considered a match for a single region covered by the multi-region range (eg. \"us-central1\").\n", + "\n", + "\n", + "4. Set your preferred region.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1f81f6289064" + }, + "source": [ + "**Note:** You do not need to fill out the `PROJECT_ID`, `BUCKET_URI`, or `REGION` as we will pick these values from `os.environ[\"GOOGLE_CLOUD_PROJECT\"]`, `os.environ[\"GOOGLE_CLOUD_REGION\"]` and the code will create a temporary bucket. \n", + "\n", + "If you would like to specify specific values, replace the empty strings below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fda9a1639038" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "BUCKET_URI = \"gs://\" # @param {type:\"string\"}\n", + "REGION = \"\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SPdyTYbDjeHw" + }, + "outputs": [], + "source": [ + "# Import the necessary packages\n", + "\n", + "# Upgrade Vertex AI SDK.\n", + "% pip install --upgrade --quiet 'google-cloud-aiplatform>=1.64.0'\n", + "! 
git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git\n", + "\n", + "import datetime\n", + "import importlib\n", + "import os\n", + "import uuid\n", + "\n", + "from google.cloud import aiplatform\n", + "\n", + "common_util = importlib.import_module(\n", + " \"vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util\"\n", + ")\n", + "\n", + "models, endpoints = {}, {}\n", + "\n", + "# Get the default cloud project id.\n", + "if not PROJECT_ID:\n", + " PROJECT_ID = os.environ[\"GOOGLE_CLOUD_PROJECT\"]\n", + "\n", + "# Get the default region for launching jobs.\n", + "if not REGION:\n", + " REGION = os.environ[\"GOOGLE_CLOUD_REGION\"]\n", + "\n", + "# Enable the Vertex AI API and Compute Engine API, if not already.\n", + "print(\"Enabling Vertex AI API and Compute Engine API.\")\n", + "! gcloud services enable aiplatform.googleapis.com compute.googleapis.com --project {PROJECT_ID}\n", + "\n", + "# Cloud Storage bucket for storing the experiment artifacts.\n", + "# A unique GCS bucket will be created for the purpose of this notebook. If you\n", + "# prefer using your own GCS bucket, change the value yourself below.\n", + "now = datetime.datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", + "BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n", + "\n", + "if BUCKET_URI is None or BUCKET_URI.strip() == \"\" or BUCKET_URI == \"gs://\":\n", + " BUCKET_URI = f\"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}\"\n", + " BUCKET_NAME = \"/\".join(BUCKET_URI.split(\"/\")[:3])\n", + " ! gsutil mb -l {REGION} {BUCKET_URI}\n", + "else:\n", + " assert BUCKET_URI.startswith(\"gs://\"), \"BUCKET_URI must start with `gs://`.\"\n", + " shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep \"Location constraint:\" | sed \"s/Location constraint://\"\n", + " bucket_region = shell_output[0].strip().lower()\n", + " if bucket_region != REGION:\n", + " raise ValueError(\n", + " \"Bucket region %s is different from notebook region %s\"\n", + " % (bucket_region, REGION)\n", + " )\n", + "print(f\"Using this GCS Bucket: {BUCKET_URI}\")\n", + "\n", + "STAGING_BUCKET = os.path.join(BUCKET_URI, \"temporal\")\n", + "MODEL_BUCKET = os.path.join(BUCKET_URI, \"llama3-3\")\n", + "\n", + "\n", + "# Initialize Vertex AI API.\n", + "print(\"Initializing Vertex AI API.\")\n", + "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)\n", + "\n", + "# Gets the default SERVICE_ACCOUNT.\n", + "shell_output = ! gcloud projects describe $PROJECT_ID\n", + "project_number = shell_output[-1].split(\":\")[1].strip().replace(\"'\", \"\")\n", + "SERVICE_ACCOUNT = f\"{project_number}-compute@developer.gserviceaccount.com\"\n", + "print(\"Using this default Service Account:\", SERVICE_ACCOUNT)\n", + "\n", + "\n", + "# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket\n", + "! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME\n", + "\n", + "! gcloud config set project $PROJECT_ID\n", + "! gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=\"roles/storage.admin\"\n", + "! 
gcloud projects add-iam-policy-binding --no-user-output-enabled {PROJECT_ID} --member=serviceAccount:{SERVICE_ACCOUNT} --role=\"roles/aiplatform.user\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c55789278f88" + }, + "source": [ + "# Access Llama 3.3 models on Vertex AI for serving\n", + "The original models from Meta are converted into the Hugging Face format for serving in Vertex AI.\n", + "Accept the model agreement to access the models:\n", + "1. Open the [Llama 3.3 model card](https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama3-3) from [Vertex AI Model Garden](https://cloud.google.com/model-garden).\n", + "2. Review and accept the agreement in the pop-up window on the model card page. If you have previously accepted the model agreement, there will not be a pop-up window on the model card page and this step is not needed.\n", + "3. After accepting the agreement of Llama 3.3, a `gs://` URI containing Llama 3.3 models will be shared.\n", + "4. Paste the URI in the `VERTEX_AI_MODEL_GARDEN_LLAMA_3_3` field below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7b8ae65ff28c" + }, + "outputs": [], + "source": [ + "VERTEX_AI_MODEL_GARDEN_LLAMA_3_3 = \"gs://vertex-model-garden-public-us/llama3.3\" # @param {type:\"string\", isTemplate:true}\n", + "assert (\n", + " VERTEX_AI_MODEL_GARDEN_LLAMA_3_3\n", + "), \"Click the agreement of Llama 3.3 in Vertex AI Model Garden, and get the GCS path of Llama 3.3 model artifacts.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KCsVQ2qolxSD" + }, + "source": [ + "### Deploy Llama 3.3 70B Instruct with vLLM\n", + "\n", + "The following step will register the model in Vertex AI and deploy it to an endpoint for serving. \n", + "\n", + "\n", + "**NOTE:** It usually takes 15~30 minutes to complete. \n", + "\n", + "To monitor deployment progress, navigate to [Vertex AI](https://console.cloud.google.com/vertex-ai) -> Online Prediction -> View logs for the Llama 3.3 endpoint." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bVxM9XiEjt1I" + }, + "outputs": [], + "source": [ + "# @markdown This section uploads Llama 3.3 to Model Registry and deploys it to a Vertex AI Endpoint. It takes ~30 minutes.\n", + "\n", + "# @markdown The serving efficiency of L4 GPUs is inferior to that of H100 GPUs, but L4 GPUs are nevertheless good serving solutions if you do not have H100 quota.\n", + "\n", + "# @markdown H100 is hard to get for now. It's recommended to use the deployment button in the model card. 
You can still try to deploy H100 endpoint through the notebook, but there is a chance that resource is not available.\n", + "\n", + "# @markdown Set the model to deploy.\n", + "\n", + "base_model_name = (\n", + " \"Llama-3.3-70B-Instruct\" # @param [\"Llama-3.3-70B-Instruct\"] {isTemplate:true}\n", + ")\n", + "model_id = os.path.join(VERTEX_AI_MODEL_GARDEN_LLAMA_3_3, base_model_name)\n", + "ENABLE_DYNAMIC_LORA = False\n", + "hf_model_id = \"meta-llama/\" + base_model_name\n", + "\n", + "accelerator_type = \"NVIDIA_L4\" # @param [\"NVIDIA_H100_80GB\", \"NVIDIA_L4\"]\n", + "\n", + "# The pre-built serving docker images.\n", + "VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241001_0916_RC00\"\n", + "\n", + "use_dedicated_endpoint = False\n", + "# @markdown Find Vertex AI prediction supported accelerators and regions at https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.\n", + "if accelerator_type == \"NVIDIA_L4\":\n", + " machine_type = \"g2-standard-96\"\n", + " accelerator_count = 8\n", + " max_loras = 1\n", + "elif accelerator_type == \"NVIDIA_H100_80GB\":\n", + " machine_type = \"a3-highgpu-4g\"\n", + " accelerator_count = 4\n", + " max_loras = 1\n", + "else:\n", + " raise ValueError(\n", + " f\"Recommended GPU setting not found for: {accelerator_type} and {base_model_name}.\"\n", + " )\n", + "\n", + "common_util.check_quota(\n", + " project_id=PROJECT_ID,\n", + " region=REGION,\n", + " accelerator_type=accelerator_type,\n", + " accelerator_count=accelerator_count,\n", + " is_for_training=False,\n", + ")\n", + "\n", + "gpu_memory_utilization = 0.95\n", + "max_model_len = 8192 # Maximum context length.\n", + "\n", + "\n", + "def deploy_model_vllm(\n", + " model_name: str,\n", + " model_id: str,\n", + " publisher: str,\n", + " publisher_model_id: str,\n", + " service_account: str,\n", + " base_model_id: str = None,\n", + " machine_type: str = \"g2-standard-8\",\n", + " accelerator_type: str = \"NVIDIA_L4\",\n", + " accelerator_count: int = 1,\n", + " gpu_memory_utilization: float = 0.9,\n", + " max_model_len: int = 4096,\n", + " dtype: str = \"auto\",\n", + " enable_trust_remote_code: bool = False,\n", + " enforce_eager: bool = False,\n", + " enable_lora: bool = False,\n", + " enable_chunked_prefill: bool = False,\n", + " enable_prefix_cache: bool = False,\n", + " host_prefix_kv_cache_utilization_target: float = 0.0,\n", + " max_loras: int = 1,\n", + " max_cpu_loras: int = 8,\n", + " use_dedicated_endpoint: bool = False,\n", + " max_num_seqs: int = 256,\n", + " model_type: str = None,\n", + ") -> tuple[aiplatform.Model, aiplatform.Endpoint]:\n", + " \"\"\"Deploys trained models with vLLM into Vertex AI.\"\"\"\n", + " endpoint = aiplatform.Endpoint.create(\n", + " display_name=f\"{model_name}-endpoint\",\n", + " dedicated_endpoint_enabled=use_dedicated_endpoint,\n", + " )\n", + "\n", + " if not base_model_id:\n", + " base_model_id = model_id\n", + "\n", + " # See https://docs.vllm.ai/en/latest/models/engine_args.html for a list of possible arguments with descriptions.\n", + " vllm_args = [\n", + " \"python\",\n", + " \"-m\",\n", + " \"vllm.entrypoints.api_server\",\n", + " \"--host=0.0.0.0\",\n", + " \"--port=8080\",\n", + " f\"--model={model_id}\",\n", + " f\"--tensor-parallel-size={accelerator_count}\",\n", + " \"--swap-space=16\",\n", + " f\"--gpu-memory-utilization={gpu_memory_utilization}\",\n", + " f\"--max-model-len={max_model_len}\",\n", + " f\"--dtype={dtype}\",\n", + " f\"--max-loras={max_loras}\",\n", + " 
f\"--max-cpu-loras={max_cpu_loras}\",\n", + " f\"--max-num-seqs={max_num_seqs}\",\n", + " \"--disable-log-stats\",\n", + " ]\n", + "\n", + " if enable_trust_remote_code:\n", + " vllm_args.append(\"--trust-remote-code\")\n", + "\n", + " if enforce_eager:\n", + " vllm_args.append(\"--enforce-eager\")\n", + "\n", + " if enable_lora:\n", + " vllm_args.append(\"--enable-lora\")\n", + "\n", + " if enable_chunked_prefill:\n", + " vllm_args.append(\"--enable-chunked-prefill\")\n", + "\n", + " if enable_prefix_cache:\n", + " vllm_args.append(\"--enable-prefix-caching\")\n", + "\n", + " if 0 < host_prefix_kv_cache_utilization_target < 1:\n", + " vllm_args.append(\n", + " f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n", + " )\n", + "\n", + " if model_type:\n", + " vllm_args.append(f\"--model-type={model_type}\")\n", + "\n", + " env_vars = {\n", + " \"MODEL_ID\": base_model_id,\n", + " \"DEPLOY_SOURCE\": \"notebook\",\n", + " }\n", + "\n", + " # HF_TOKEN is not a compulsory field and may not be defined.\n", + " try:\n", + " if HF_TOKEN:\n", + " env_vars[\"HF_TOKEN\"] = HF_TOKEN\n", + " except NameError:\n", + " pass\n", + "\n", + " model = aiplatform.Model.upload(\n", + " display_name=model_name,\n", + " serving_container_image_uri=VLLM_DOCKER_URI,\n", + " serving_container_args=vllm_args,\n", + " serving_container_ports=[8080],\n", + " serving_container_predict_route=\"/generate\",\n", + " serving_container_health_route=\"/ping\",\n", + " serving_container_environment_variables=env_vars,\n", + " serving_container_shared_memory_size_mb=(16 * 1024), # 16 GB\n", + " serving_container_deployment_timeout=7200,\n", + " model_garden_source_model_name=(\n", + " f\"publishers/{publisher}/models/{publisher_model_id}\"\n", + " ),\n", + " )\n", + " print(\n", + " f\"Deploying {model_name} on {machine_type} with {accelerator_count} {accelerator_type} GPU(s).\"\n", + " )\n", + " model.deploy(\n", + " endpoint=endpoint,\n", + " machine_type=machine_type,\n", + " accelerator_type=accelerator_type,\n", + " accelerator_count=accelerator_count,\n", + " deploy_request_timeout=1800,\n", + " service_account=service_account,\n", + " system_labels={\n", + " \"NOTEBOOK_NAME\": \"model_garden_pytorch_llama3_3_deployment.ipynb\",\n", + " },\n", + " )\n", + " print(\"endpoint_name:\", endpoint.name)\n", + "\n", + " return model, endpoint\n", + "\n", + "\n", + "models[\"vllm_gpu\"], endpoints[\"vllm_gpu\"] = deploy_model_vllm(\n", + " model_name=common_util.get_job_name_with_datetime(prefix=\"llama3-3-serve\"),\n", + " model_id=model_id,\n", + " publisher=\"meta\",\n", + " publisher_model_id=\"llama3-3\",\n", + " base_model_id=hf_model_id,\n", + " service_account=SERVICE_ACCOUNT,\n", + " machine_type=machine_type,\n", + " accelerator_type=accelerator_type,\n", + " accelerator_count=accelerator_count,\n", + " gpu_memory_utilization=gpu_memory_utilization,\n", + " max_model_len=max_model_len,\n", + " max_loras=max_loras,\n", + " enforce_eager=True,\n", + " enable_lora=ENABLE_DYNAMIC_LORA,\n", + " enable_chunked_prefill=not ENABLE_DYNAMIC_LORA,\n", + " use_dedicated_endpoint=use_dedicated_endpoint,\n", + " model_type=\"llama3.1\",\n", + ")\n", + "# @markdown Click \"Show Code\" to see more details." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s8bAhMq8wglq" + }, + "source": [ + "#### Get the endpoint ID\n", + "\n", + "Once the deployment is successful, we need to grab the endpoint. 
We can reconstruct the endpoint URL string from the deployment in the previous step:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "xIWHZNzkwEIt"
   },
   "outputs": [],
   "source": [
    "ENDPOINT_ID = f\"https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoints['vllm_gpu'].name}\"\n",
    "print(ENDPOINT_ID)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "sFqr3PYnoxUf"
   },
   "source": [
    "## Data Analytics in BigQuery"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SNeGovGko2Aw"
   },
   "source": [
    "### Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "RXDi9glXtUBN"
   },
   "source": [
    "#### Create a new dataset\n",
    "\n",
    "This will house any tables created throughout this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "VxcDQUwxta8v"
   },
   "outputs": [],
   "source": [
    "!bq mk --location=us --dataset --project_id={PROJECT_ID} demo_dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "M26CPteyth0R"
   },
   "source": [
    "#### Create a Cloud resource connection\n",
    "\n",
    "[Cloud resource connections](https://cloud.google.com/bigquery/docs/connections-api-intro#cloud-resource-connections) enable BigQuery to access other Cloud services, like Cloud Storage and Vertex AI."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "rZB9N43itty-"
   },
   "outputs": [],
   "source": [
    "!bq mk --connection --connection_type=CLOUD_RESOURCE --location=us --project_id={PROJECT_ID} \"demo_conn\"\n",
    "!bq show --location=us --connection --project_id={PROJECT_ID} \"demo_conn\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "jd-6mfdot8x7"
   },
   "source": [
    "#### Add permissions to Cloud resource connection service account\n",
    "\n",
    "The Cloud resource connection is associated with a service account. \n",
    "\n",
    "The following cell enables the service account to access the Vertex AI service.\n",
    "\n",
    "**Note:** Copy the service account ID from the prior cell and input it below. It will look like `your-copied-service-account@gcp-sa-bigquery-condel.iam.gserviceaccount.com`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "NiJUgWYYuC6n"
   },
   "outputs": [],
   "source": [
    "connection_service_account = \"your-copied-service-account@gcp-sa-bigquery-condel.iam.gserviceaccount.com\" # @param {type: \"string\"}\n",
    "connection_member = f\"serviceAccount:{connection_service_account}\"\n",
    "\n",
    "\n",
    "!gcloud projects add-iam-policy-binding {PROJECT_ID} --member={connection_member} --role='roles/aiplatform.user' --condition=None --quiet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "tEhojixwvGrt"
   },
   "source": [
    "### Load medical transcripts into BigQuery\n",
    "\n",
    "For this demo, we will use a small medical transcripts dataset of unstructured, varied raw transcripts that capture the history, diagnosis, and treatment of patients visiting a medical facility.\n",
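If you would like to peek at a single raw record before loading the file, here is a minimal sketch; it assumes the public `gs://cloud-samples-data/vertex-ai/model-evaluation/peft_eval_sample.jsonl` sample file referenced in the `LOAD DATA` statement below is readable with your credentials.

```python
# Optional: inspect one raw record from the public sample file before loading it.
import json

from google.cloud import storage

blob = (
    storage.Client(project=PROJECT_ID)
    .bucket("cloud-samples-data")
    .blob("vertex-ai/model-evaluation/peft_eval_sample.jsonl")
)
first_record = json.loads(blob.download_as_text().splitlines()[0])
print(json.dumps(first_record, indent=2)[:1000])
```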
\n", + "\n", + "The dataset contains 164 rows, each with an average of 814 input tokens.\n", + "\n", + "Let's load this dataset into a BigQuery table and peek one example row:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x2D6NnFovOdb" + }, + "outputs": [], + "source": [ + "%%bigquery --project $PROJECT_ID\n", + "\n", + "LOAD DATA OVERWRITE\n", + " demo_dataset.medical_transcript\n", + "FROM FILES( format='NEWLINE_DELIMITED_JSON',\n", + " uris = ['gs://cloud-samples-data/vertex-ai/model-evaluation/peft_eval_sample.jsonl'] );\n", + "\n", + "SELECT * FROM demo_dataset.medical_transcript LIMIT 10;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7UbctAXm0XaA" + }, + "source": [ + "### Create a remote model\n", + "\n", + "In this next step we will register a Vertex AI endpoint as a remote model in BigQuery. Then, you use the [ML.GENERATE_TEXT function](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-predict) to make predictions using the remote model.\n", + "\n", + "You can use remote models when a model is too large to import into BigQuery. They are also useful when you want to have a single point of inference for online, batch, and micro-batch use cases.\n", + "\n", + "**Note**: If you get an error in this step is because the service account permissions have yet finished propagating, please wait a minute and retry\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BsMbwJc10okx" + }, + "outputs": [], + "source": [ + "from google.cloud import bigquery\n", + "\n", + "client = bigquery.Client(project=PROJECT_ID)\n", + "\n", + "query = f\"\"\"\n", + "CREATE OR REPLACE MODEL demo_dataset.llama_3_3\n", + "REMOTE WITH CONNECTION\n", + " `{PROJECT_ID}.us.demo_conn`\n", + "OPTIONS(\n", + " endpoint='{ENDPOINT_ID}'\n", + ");\n", + "\"\"\"\n", + "\n", + "query_job = client.query(query) # API request\n", + "query_job.result() # Waits for the query to complete\n", + "\n", + "print(\"Remote model `demo_dataset.llama_3_3` created or replaced successfully.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jJ9KjfU52gf3" + }, + "source": [ + "### Extract structured entity from medical transcripts\n", + "\n", + "We will now use our Llama model to extract structured data from the unstructured transcripts with [ML.GENERATE_TEXT](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-text). Say we want to extract the patient's age, gender and list of diseases for each entry. We can do so with a SQL statement like the following and save the derived insights to a table. We include the information we want to extract and its schema in the model prompt." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-fd4S6Wp2ari" + }, + "outputs": [], + "source": [ + "%%bigquery --project $PROJECT_ID\n", + "\n", + "CREATE TEMP FUNCTION ExtractOutput(s STRING)\n", + "RETURNS STRING\n", + "AS (\n", + " SUBSTR(s, INSTR(s, \"Output:\")+8)\n", + ");\n", + "\n", + "\n", + "CREATE OR REPLACE TABLE demo_dataset.medical_transcript_analysis AS (\n", + "SELECT\n", + " ExtractOutput(ml_generate_text_llm_result) AS generated_text, * EXCEPT(ml_generate_text_llm_result)\n", + "FROM\n", + " ML.GENERATE_TEXT( MODEL `demo_dataset.llama_3_3`,\n", + " (\n", + " SELECT\n", + " CONCAT('Extract the Gender, Age (in years), and Disease information from the following medical transcript. 
Return **only** a JSON in the following schema: \\n{ \"Age\": Int, \"Gender\": \"String\", \"Disease\": [\"String\"]}. \\nIf Age, Gender, or Disease information is not found, return `null` for that field. Summarize the disease(s) in 1 to 5 words. If the patient has multiple diseases, include them in a comma-separated list within the \"Disease\" field. \\n**Do not include any other text or labels in your response.**. \\n', input_text) AS prompt\n", + " FROM\n", + " demo_dataset.medical_transcript\n", + " ),\n", + " STRUCT(\n", + " 0 AS temperature,\n", + " 0.001 AS top_p,\n", + " 1 AS top_k,\n", + " 128 AS max_output_tokens,\n", + " TRUE AS flatten_json_output))\n", + ");\n", + "\n", + "\n", + "SELECT * FROM demo_dataset.medical_transcript_analysis;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "voR8CEbx4IHv" + }, + "source": [ + "### Analyze structured outputs for insights\n", + "\n", + "We can now perform all sorts of analytics on this data. For example, let us answer `What are the most common diseases in females ages 30+ in our sample?`. Using a simple SQL query like below we see that 'Hypertension', 'Arthritis' and 'Hyperlipidemia' are most common." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8UU1N1z85i_q" + }, + "outputs": [], + "source": [ + "%%bigquery --project $PROJECT_ID\n", + "\n", + "WITH\n", + " parsed_data AS (\n", + " SELECT\n", + " JSON_EXTRACT_SCALAR(generated_text, '$.Gender') AS gender,\n", + " CAST(JSON_EXTRACT_SCALAR(generated_text, '$.Age') AS INT64) AS age,\n", + " JSON_EXTRACT_ARRAY(generated_text, '$.Disease') AS diseases,\n", + " FROM\n", + " demo_dataset.medical_transcript_analysis)\n", + "\n", + "SELECT\n", + " disease,\n", + " count(*) AS occurrence\n", + "FROM\n", + " parsed_data, UNNEST(diseases) AS disease\n", + "WHERE\n", + " LOWER(gender) = 'female'\n", + " AND age >= 30\n", + "GROUP BY\n", + " disease\n", + "ORDER BY\n", + " occurrence DESC\n", + "LIMIT 3;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can delete the individual resources you created in this tutorial on both Vertex AI and BigQuery side:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eQ3LpU9763Lv" + }, + "source": [ + "### Delete the Vertex AI model and endpoint\n", + "Delete the experiment models and endpoints to recycle the resources and avoid unnecessary continuous charges that may incur." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_5iltjKv68NJ" + }, + "outputs": [], + "source": [ + "# Undeploy model and delete endpoint.\n", + "for endpoint in endpoints.values():\n", + " endpoint.delete(force=True)\n", + "\n", + "# Delete models.\n", + "for model in models.values():\n", + " model.delete()\n", + "\n", + "delete_bucket = True # @param {type:\"boolean\"}\n", + "if delete_bucket:\n", + " ! 
gsutil -m rm -r $BUCKET_NAME"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "gwlpFUem7S4M"
   },
   "source": [
    "### Delete BigQuery dataset and connection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "d7beBMcw7U_J"
   },
   "outputs": [],
   "source": [
    "!gcloud projects remove-iam-policy-binding $PROJECT_ID --member=serviceAccount:$SERVICE_ACCOUNT --role=\"roles/aiplatform.user\"\n",
    "!gcloud projects remove-iam-policy-binding $PROJECT_ID --member=serviceAccount:$SERVICE_ACCOUNT --role=\"roles/storage.admin\"\n",
    "!bq rm -r -f $PROJECT_ID:demo_dataset\n",
    "!bq rm --connection --project_id=$PROJECT_ID --location=us demo_conn"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "bigquery_ml_llama_inference.ipynb",
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}