From e0ffa65263ab8d32597209cf03d625718442933e Mon Sep 17 00:00:00 2001 From: Alex Baur Date: Wed, 16 Jul 2025 16:04:46 -0700 Subject: [PATCH 1/5] Template AgentOps project files added. --- .../README_agentops.md.tmpl | 122 ++++ .../agent_deployment/README.md.tmpl | 9 + .../LaunchApp.py.tmpl | 433 ++++++++++++++ .../chat_interface_deployment/README.md.tmpl | 11 + .../requirements.txt.tmpl | 4 + .../chat_interface_deployment/utils.py.tmpl | 4 + .../notebooks/ModelServing.py.tmpl | 186 ++++++ .../model_serving/serving/__init__.py.tmpl | 7 + .../model_serving/serving/serving.py.tmpl | 20 + .../agent_development/README.md.tmpl | 0 .../agent_development/__init__.py.tmpl | 0 .../agent_development/agent/README.md.tmpl | 22 + .../agent/notebooks/Agent.py.tmpl | 564 ++++++++++++++++++ .../agent/tools/__init__.py.tmpl | 0 .../agent/tools/ai_tools.py.tmpl | 74 +++ .../evaluation/__init__.py.tmpl | 9 + .../evaluation/evaluation.py.tmpl | 10 + .../notebooks/AgentEvaluation.py.tmpl | 167 ++++++ .../agent_requirements.txt.tmpl | 6 + .../data_preparation/README.md.tmpl | 0 .../data_ingestion/README.md.tmpl | 1 + .../data_ingestion/__init__.py.tmpl | 0 .../data_ingestion/ingestion/__init__.py.tmpl | 0 .../ingestion/fetch_data.py.tmpl | 85 +++ .../notebooks/DataIngestion.py.tmpl | 119 ++++ .../data_preprocessing/README.md.tmpl | 2 + .../data_preprocessing/__init__.py.tmpl | 0 .../notebooks/DataPreprocessing.py.tmpl | 194 ++++++ .../preprocessing/__init__.py.tmpl | 0 .../preprocessing/create_chunk.py.tmpl | 84 +++ .../vector_search/README.md.tmpl | 9 + .../notebooks/VectorSearch.py.tmpl | 149 +++++ .../vector_search_utils/__init__.py.tmpl | 17 + .../vector_search_utils/utils.py.tmpl | 65 ++ .../databricks_agentops.yml.tmpl | 93 +++ .../requirements_agentops.txt.tmpl | 16 + .../resources/README_agentops.md.tmpl | 227 +++++++ .../resources/agent-resource.yml.tmpl | 82 +++ .../agents-artifacts-resource.yml.tmpl | 40 ++ .../app-deployment-resource.yml.tmpl | 36 ++ .../data-preparation-resource.yml.tmpl | 79 +++ .../integration/model_serving_test.py.tmpl | 36 ++ 42 files changed, 2982 insertions(+) create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README_agentops.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/README.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/LaunchApp.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/README.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/requirements.txt.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/utils.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/notebooks/ModelServing.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/serving/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/serving/serving.py.tmpl create mode 100644 
template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/README.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/README.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/notebooks/Agent.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/tools/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/tools/ai_tools.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/evaluation/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/evaluation/evaluation.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/notebooks/AgentEvaluation.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_requirements.txt.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/README.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/README.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/ingestion/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/ingestion/fetch_data.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/README.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/notebooks/DataPreprocessing.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/preprocessing/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/preprocessing/create_chunk.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/README.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/notebooks/VectorSearch.py.tmpl 
create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/vector_search_utils/__init__.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/vector_search_utils/utils.py.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks_agentops.yml.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/requirements_agentops.txt.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README_agentops.md.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/agent-resource.yml.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/agents-artifacts-resource.yml.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/app-deployment-resource.yml.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml.tmpl create mode 100644 template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/integration/model_serving_test.py.tmpl diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README_agentops.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README_agentops.md.tmpl new file mode 100644 index 00000000..3c4ae8ac --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README_agentops.md.tmpl @@ -0,0 +1,122 @@ +# agentops-example-project +This project comes with example Agent code to build, evaluate, and deploy a RAG-based agent chatbot over Databricks documentation. +If you're a data scientist just getting started with this repo for a brand new Agent project, we recommend +adapting the provided example code to your Agent problem, then making and +testing Agent code changes on Databricks or your local machine. + +The "Getting Started" docs can be found at https://docs.databricks.com/dev-tools/bundles/mlops-stacks.html. + +## Table of contents +* [Code structure](#code-structure): structure of this project. + +* [Iterating on Agent code](#iterating-on-Agent-code): making and testing Agent code changes on Databricks or your local machine. +* [Next steps](#next-steps) + +This directory contains an Agent project based on the default +[Databricks MLOps Stacks](https://github.com/databricks/mlops-stacks), +defining a production-grade Agent pipeline for data preparation, agent development, evaluation, and deployment.
+ +## Code structure +This project contains the following components: + +| Component | Description | +|----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Agent Code | Example Agent project code, with unit tested Python modules and notebooks | +| Agent Resources as Code | Agent pipeline resources (data preparation and agent development jobs with schedules, etc) configured and deployed through [Databricks CLI bundles](https://docs.databricks.com/dev-tools/cli/bundle-cli.html) | + +contained in the following files: + +``` +agentops-example-project <- Root directory. Both monorepo and polyrepo are supported. +│ +├── agent-example-project <- Contains python code, notebooks and Agent resources related to one Agent project. +│ │ +│ ├── requirements.txt <- Specifies Python dependencies for ML code (for example: model training, batch inference). +│ │ +│ ├── databricks.yml <- databricks.yml is the root bundle file for the ML project that can be loaded by Databricks CLI bundles. It defines the bundle name, workspace URL and resource config component to be included. +│ │ +│ ├── data_preparation <- Retrieves, stores, cleans, and vectorizes source data that is then ingested into a Vector Search index. +│ │ │ +│ │ ├── data_ingestion <- Databricks Documentation scraping retrieval and storage. +│ │ │ +│ │ ├── data_preprocessing <- Documentation cleansing and vectorization. +│ │ │ +│ │ ├── vector_search <- Vector Search and index creation and ingestion. +│ │ +│ │ +│ ├── agent_development <- Creates, registers, and evaluates the agent. +│ │ │ +│ │ ├── agent <- LangGraph Agent creation. +│ │ │ +│ │ ├── agent_evaluation <- Databricks Agent llm-as-a-judge evaluation. +│ │ +│ ├── agent_deployment <- Deploys agent serving and contains a Databricks Apps front end interface. +│ │ │ +│ │ ├── chat_interface_deployment <- Databricks App front end interface for end users. +│ │ │ +│ │ ├── model_serving <- Model serving endpoint for the Agent. +│ │ +│ │ +│ ├── tests <- Tests for the Agent project. +│ │ +│ ├── resources <- Agent resource (Agent jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. +│ │ +│ ├── data-preparation-resource.yml <- Agent resource config definition for data preparation and vectorization. +│ │ +│ ├── agent-resource-workflow-resource.yml <- Agent resource config definition for agent development, evaluation, and deployment. +│ │ +│ ├── app-deployment-resource.yml <- Agent resource config definition for launching the Databricks App frontend. +│ │ +│ ├── ml-artifacts-resource.yml <- Agent resource config definition for model and experiment. +``` + + +## Iterating on Agent code + +### Deploy Agent code and resources to dev workspace using Bundles + +Refer to [Local development and dev workspace](./resources/README.md#local-development-and-dev-workspace) +to use Databricks CLI bundles to deploy ML code together with ML resource configs to dev workspace. + +This will allow you to develop locally and use Databricks CLI bundles to deploy to your dev workspace to test out code and config changes. 
+ +### Develop on Databricks using Databricks Repos + +#### Prerequisites +You'll need: +* Access to run commands on a cluster running Databricks Runtime ML version 11.0 or above in your dev Databricks workspace +* To set up [Databricks Repos](https://docs.databricks.com/repos/index.html): see instructions below + +#### Configuring Databricks Repos +To use Repos, [set up git integration](https://docs.databricks.com/repos/repos-setup.html) in your dev workspace. + +If the current project has already been pushed to a hosted Git repo, follow the +[UI workflow](https://docs.databricks.com/repos/git-operations-with-repos.html#add-a-repo-connected-to-a-remote-repo) +to clone it into your dev workspace and iterate. + +Otherwise, e.g. if iterating on Agent code for a new project, follow the steps below: +* Follow the [UI workflow](https://docs.databricks.com/repos/git-operations-with-repos.html#add-a-repo-connected-to-a-remote-repo) + for creating a repo, but uncheck the "Create repo by cloning a Git repository" checkbox. +* Install the `dbx` CLI via `pip install --upgrade dbx` +* Run `databricks configure --profile mlops-example-project-dev --token --host <dev-workspace-url>`, passing the URL of your dev workspace. + This should prompt you to enter an API token. +* [Create a personal access token](https://docs.databricks.com/dev-tools/auth/pat.html) + in your dev workspace and paste it into the prompt from the previous step +* From within the root directory of the current project, use the [dbx sync](https://dbx.readthedocs.io/en/latest/guides/python/devloop/mixed/#using-dbx-sync-repo-for-local-to-repo-synchronization) tool to copy code files from your local machine into the Repo by running + `dbx sync repo --profile mlops-example-project-dev --source . --dest-repo your-repo-name`, where `your-repo-name` should be the last segment of the full repo name (`/Repos/username/your-repo-name`) + + + +## Next Steps + +When you're satisfied with initial Agent experimentation (e.g. you have validated that an agent of reasonable quality can be built on your data) and are ready to deploy production data preparation, agent development, and deployment pipelines, ask your ops team to set up CI/CD for the current Agent project if they haven't already. CI/CD can be set up as part of the AI Agent Ops Stacks initialization, even if it was skipped in this case, or this project can be added to a repo already set up with CI/CD by following the directions under "Setting up CI/CD" in the repo root directory README. + +To add CI/CD to this repo: + 1. Run `databricks bundle init mlops-stacks` via the Databricks CLI + 2. Select the option to only initialize `CICD_Only` + 3. Provide the root directory of this project and answer the subsequent prompts + +More details can be found on the homepage [MLOps Stacks README](https://github.com/databricks/mlops-stacks/blob/main/README.md). diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/README.md.tmpl new file mode 100644 index 00000000..1c090a75 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/README.md.tmpl @@ -0,0 +1,9 @@ +# Agent Deployment + +## Model Serving + +To grant stakeholders access to the Review App, fill in the TODO section of the Grant Permissions block in the ModelServing notebook and uncomment it.
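+
+As a reference, here is a minimal sketch of the uncommented block, using the notebook's default catalog, schema, and registered model name plus a placeholder e-mail (substitute your own values):
+
+```python
+from databricks import agents
+
+# Placeholder Unity Catalog model name and stakeholder e-mails -- replace with your own.
+model_name = "ai_agent_stacks.ai_agent_ops.agent_function_chatbot"
+user_list = ["firstname.lastname@company.com"]
+
+# Grant stakeholders permission to query the agent endpoint and use the Review App.
+agents.set_permissions(
+    model_name=model_name,
+    users=user_list,
+    permission_level=agents.PermissionLevel.CAN_QUERY,
+)
+```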
+ +## Chat Interface + +The Chat Interface was created with Databricks Apps using Dash as the front end interface. \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/LaunchApp.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/LaunchApp.py.tmpl new file mode 100644 index 00000000..995fb062 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/LaunchApp.py.tmpl @@ -0,0 +1,433 @@ +# Databricks notebook source +# MAGIC %pip install -qqqq pyyaml databricks-agents databricks-sdk==0.49.0 + +# COMMAND ---------- + +# List of input args needed to run the notebook as a job. +# Provide them via DB widgets or notebook arguments. + +# A Unity Catalog containing the model +dbutils.widgets.text( + "uc_catalog", + "ai_agent_stacks", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "ai_agent_ops", + label="Schema", +) +# Name of model registered in mlflow +dbutils.widgets.text( + "registered_model", + "agent_function_chatbot", + label="Registered model name", +) +# Name of the Databricks App +dbutils.widgets.text( + "app_name", + "dash-chatbot-app", + label="App Name", +) +# Name of the Agent Model Endpoint +dbutils.widgets.text( + "agent_model_endpoint", + "databricks-meta-llama-3-3-70b-instruct", + label="Agent Model Endpoint", +) + +# COMMAND ---------- + +dbutils.library.restartPython() + +# COMMAND ---------- + +app_name = dbutils.widgets.get("app_name") +uc_catalog = dbutils.widgets.get("uc_catalog") +schema = dbutils.widgets.get("schema") +registered_model = dbutils.widgets.get("registered_model") +agent_model_endpoint = dbutils.widgets.get("agent_model_endpoint") + +assert app_name != "", "app_name notebook parameter must be specified" +assert uc_catalog != "", "uc_catalog notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert registered_model != "", "registered_model notebook parameter must be specified" +assert agent_model_endpoint != "", "agent_model_endpoint notebook parameter must be specified" + +# COMMAND ---------- + +import yaml +import os + +endpoint_name = agent_model_endpoint + +yaml_app_config = {"command": ["python", "app.py"], + "env": [{"name": "SERVING_ENDPOINT", "value": endpoint_name}] + } +try: + with open('app.yaml', 'w') as f: + yaml.dump(yaml_app_config, f) +except: + print('pass to work on build job') + +# COMMAND ---------- + +# MAGIC %%writefile app.py +# MAGIC import os +# MAGIC import dash +# MAGIC import dash_bootstrap_components as dbc +# MAGIC from DatabricksChatbot import DatabricksChatbot +# MAGIC from dotenv import load_dotenv +# MAGIC +# MAGIC load_dotenv() +# MAGIC +# MAGIC # Ensure environment variable is set correctly +# MAGIC serving_endpoint = os.getenv('SERVING_ENDPOINT') +# MAGIC assert serving_endpoint, 'SERVING_ENDPOINT must be set in app.yaml.' 
+# MAGIC +# MAGIC # Initialize the Dash app with a clean theme +# MAGIC app = dash.Dash(__name__, external_stylesheets=[dbc.themes.FLATLY]) +# MAGIC +# MAGIC # Create the chatbot component with a specified height +# MAGIC chatbot = DatabricksChatbot(app=app, endpoint_name=serving_endpoint, height='600px') +# MAGIC +# MAGIC # Define the app layout +# MAGIC app.layout = dbc.Container([ +# MAGIC dbc.Row([ +# MAGIC dbc.Col(chatbot.layout, width={'size': 8, 'offset': 2}) +# MAGIC ]) +# MAGIC ], fluid=True) +# MAGIC +# MAGIC if __name__ == '__main__': +# MAGIC app.run_server(debug=True) + +# COMMAND ---------- + +# MAGIC %%writefile DatabricksChatbot.py +# MAGIC import dash +# MAGIC from dash import html, Input, Output, State, dcc +# MAGIC import dash_bootstrap_components as dbc +# MAGIC from databricks.sdk import WorkspaceClient +# MAGIC from databricks.sdk.service.serving import ChatMessage, ChatMessageRole +# MAGIC from utils import list_endpoints +# MAGIC +# MAGIC class DatabricksChatbot: +# MAGIC def __init__(self, app, endpoint_name, height='600px'): +# MAGIC self.app = app +# MAGIC self.endpoint_name = endpoint_name +# MAGIC self.height = height +# MAGIC +# MAGIC try: +# MAGIC print('Initializing WorkspaceClient...') +# MAGIC self.w = WorkspaceClient() +# MAGIC print('WorkspaceClient initialized successfully') +# MAGIC except Exception as e: +# MAGIC print(f'Error initializing WorkspaceClient: {str(e)}') +# MAGIC self.w = None +# MAGIC +# MAGIC self.layout = self._create_layout() +# MAGIC self._create_callbacks() +# MAGIC self._add_custom_css() +# MAGIC +# MAGIC def _create_layout(self): +# MAGIC return html.Div([ +# MAGIC html.H2('Chat with Databricks AI', className='chat-title mb-3'), +# MAGIC dbc.Card([ +# MAGIC dbc.CardBody([ +# MAGIC html.Div(id='chat-history', className='chat-history'), +# MAGIC ], className='d-flex flex-column chat-body') +# MAGIC ], className='chat-card mb-3'), +# MAGIC dbc.InputGroup([ +# MAGIC dbc.Input(id='user-input', placeholder='Type your message here...', type='text'), +# MAGIC dbc.Button('Send', id='send-button', color='success', n_clicks=0, className='ms-2'), +# MAGIC dbc.Button('Clear', id='clear-button', color='danger', n_clicks=0, className='ms-2'), +# MAGIC ], className='mb-3'), +# MAGIC dcc.Store(id='assistant-trigger'), +# MAGIC dcc.Store(id='chat-history-store'), +# MAGIC html.Div(id='dummy-output', style={'display': 'none'}), +# MAGIC ], className='d-flex flex-column chat-container p-3') +# MAGIC +# MAGIC def _create_callbacks(self): +# MAGIC @self.app.callback( +# MAGIC Output('chat-history-store', 'data', allow_duplicate=True), +# MAGIC Output('chat-history', 'children', allow_duplicate=True), +# MAGIC Output('user-input', 'value'), +# MAGIC Output('assistant-trigger', 'data'), +# MAGIC Input('send-button', 'n_clicks'), +# MAGIC Input('user-input', 'n_submit'), +# MAGIC State('user-input', 'value'), +# MAGIC State('chat-history-store', 'data'), +# MAGIC prevent_initial_call=True +# MAGIC ) +# MAGIC def update_chat(send_clicks, user_submit, user_input, chat_history): +# MAGIC if not user_input: +# MAGIC return dash.no_update, dash.no_update, dash.no_update, dash.no_update +# MAGIC +# MAGIC chat_history = chat_history or [] +# MAGIC chat_history.append({'role': 'user', 'content': user_input}) +# MAGIC chat_display = self._format_chat_display(chat_history) +# MAGIC chat_display.append(self._create_typing_indicator()) +# MAGIC +# MAGIC return chat_history, chat_display, '', {'trigger': True} +# MAGIC +# MAGIC @self.app.callback( +# MAGIC 
Output('chat-history-store', 'data', allow_duplicate=True), +# MAGIC Output('chat-history', 'children', allow_duplicate=True), +# MAGIC Input('assistant-trigger', 'data'), +# MAGIC State('chat-history-store', 'data'), +# MAGIC prevent_initial_call=True +# MAGIC ) +# MAGIC def process_assistant_response(trigger, chat_history): +# MAGIC if not trigger or not trigger.get('trigger'): +# MAGIC return dash.no_update, dash.no_update +# MAGIC +# MAGIC chat_history = chat_history or [] +# MAGIC if (not chat_history or not isinstance(chat_history[-1], dict) +# MAGIC or 'role' not in chat_history[-1] +# MAGIC or chat_history[-1]['role'] != 'user'): +# MAGIC return dash.no_update, dash.no_update +# MAGIC +# MAGIC try: +# MAGIC assistant_response = self._call_model_endpoint(chat_history) +# MAGIC chat_history.append({ +# MAGIC 'role': 'assistant', +# MAGIC 'content': assistant_response +# MAGIC }) +# MAGIC except Exception as e: +# MAGIC error_message = f'Error: {str(e)}' +# MAGIC print(error_message) # Log the error for debugging +# MAGIC chat_history.append({ +# MAGIC 'role': 'assistant', +# MAGIC 'content': error_message +# MAGIC }) +# MAGIC +# MAGIC chat_display = self._format_chat_display(chat_history) +# MAGIC return chat_history, chat_display +# MAGIC +# MAGIC @self.app.callback( +# MAGIC Output('chat-history-store', 'data', allow_duplicate=True), +# MAGIC Output('chat-history', 'children', allow_duplicate=True), +# MAGIC Input('clear-button', 'n_clicks'), +# MAGIC prevent_initial_call=True +# MAGIC ) +# MAGIC def clear_chat(n_clicks): +# MAGIC print('Clearing chat') +# MAGIC if n_clicks: +# MAGIC return [], [] +# MAGIC return dash.no_update, dash.no_update +# MAGIC +# MAGIC def _call_model_endpoint(self, messages, max_tokens=128): +# MAGIC if self.w is None: +# MAGIC raise Exception('WorkspaceClient is not initialized') +# MAGIC +# MAGIC chat_messages = [ +# MAGIC ChatMessage( +# MAGIC content=message['content'], +# MAGIC role=ChatMessageRole[message['role'].upper()] +# MAGIC ) for message in messages +# MAGIC ] +# MAGIC try: +# MAGIC print(f'Calling model endpoint...{self.endpoint_name}') +# MAGIC response = self.w.serving_endpoints.query( +# MAGIC name=self.endpoint_name, +# MAGIC messages=chat_messages, +# MAGIC max_tokens=max_tokens +# MAGIC ) +# MAGIC message = response.choices[0].message.content +# MAGIC print('Model endpoint called successfully') +# MAGIC return message +# MAGIC except Exception as e: +# MAGIC print(f'Error calling model endpoint: {str(e)}') +# MAGIC raise +# MAGIC +# MAGIC def _format_chat_display(self, chat_history): +# MAGIC return [ +# MAGIC html.Div([ +# MAGIC html.Div(msg['content'], +# MAGIC className=f"chat-message {msg['role']}-message") +# MAGIC ], className=f"message-container {msg['role']}-container") +# MAGIC for msg in chat_history if isinstance(msg, dict) and 'role' in msg +# MAGIC ] +# MAGIC +# MAGIC def _create_typing_indicator(self): +# MAGIC return html.Div([ +# MAGIC html.Div(className='chat-message assistant-message typing-message', +# MAGIC children=[ +# MAGIC html.Div(className='typing-dot'), +# MAGIC html.Div(className='typing-dot'), +# MAGIC html.Div(className='typing-dot') +# MAGIC ]) +# MAGIC ], className='message-container assistant-container') +# MAGIC +# MAGIC def _add_custom_css(self): +# MAGIC custom_css = ''' +# MAGIC @import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;500;700&display=swap'); +# MAGIC body { +# MAGIC font-family: 'DM Sans', sans-serif; +# MAGIC background-color: #F9F7F4; /* Oat Light */ +# MAGIC } +# 
MAGIC .chat-container { +# MAGIC max-width: 800px; +# MAGIC margin: 0 auto; +# MAGIC background-color: #FFFFFF; +# MAGIC border-radius: 10px; +# MAGIC box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); +# MAGIC height: 100vh; +# MAGIC display: flex; +# MAGIC flex-direction: column; +# MAGIC } +# MAGIC .chat-title { +# MAGIC font-size: 24px; +# MAGIC font-weight: 700; +# MAGIC color: #1B3139; /* Navy 800 */ +# MAGIC text-align: center; +# MAGIC } +# MAGIC .chat-card { +# MAGIC border: none; +# MAGIC background-color: #EEEDE9; /* Oat Medium */ +# MAGIC flex-grow: 1; +# MAGIC display: flex; +# MAGIC flex-direction: column; +# MAGIC overflow: hidden; +# MAGIC } +# MAGIC .chat-body { +# MAGIC flex-grow: 1; +# MAGIC overflow: hidden; +# MAGIC display: flex; +# MAGIC flex-direction: column; +# MAGIC } +# MAGIC .chat-history { +# MAGIC flex-grow: 1; +# MAGIC overflow-y: auto; +# MAGIC padding: 15px; +# MAGIC } +# MAGIC .message-container { +# MAGIC display: flex; +# MAGIC margin-bottom: 15px; +# MAGIC } +# MAGIC .user-container { +# MAGIC justify-content: flex-end; +# MAGIC } +# MAGIC .chat-message { +# MAGIC max-width: 80%; +# MAGIC padding: 10px 15px; +# MAGIC border-radius: 20px; +# MAGIC font-size: 16px; +# MAGIC line-height: 1.4; +# MAGIC } +# MAGIC .user-message { +# MAGIC background-color: #FF3621; /* Databricks Orange 600 */ +# MAGIC color: white; +# MAGIC } +# MAGIC .assistant-message { +# MAGIC background-color: #1B3139; /* Databricks Navy 800 */ +# MAGIC color: white; +# MAGIC } +# MAGIC .typing-message { +# MAGIC background-color: #2D4550; /* Lighter shade of Navy 800 */ +# MAGIC color: #EEEDE9; /* Oat Medium */ +# MAGIC display: flex; +# MAGIC justify-content: center; +# MAGIC align-items: center; +# MAGIC min-width: 60px; +# MAGIC } +# MAGIC .typing-dot { +# MAGIC width: 8px; +# MAGIC height: 8px; +# MAGIC background-color: #EEEDE9; /* Oat Medium */ +# MAGIC border-radius: 50%; +# MAGIC margin: 0 3px; +# MAGIC animation: typing-animation 1.4s infinite ease-in-out; +# MAGIC } +# MAGIC .typing-dot:nth-child(1) { animation-delay: 0s; } +# MAGIC .typing-dot:nth-child(2) { animation-delay: 0.2s; } +# MAGIC .typing-dot:nth-child(3) { animation-delay: 0.4s; } +# MAGIC @keyframes typing-animation { +# MAGIC 0% { transform: translateY(0px); } +# MAGIC 50% { transform: translateY(-5px); } +# MAGIC 100% { transform: translateY(0px); } +# MAGIC } +# MAGIC #user-input { +# MAGIC border-radius: 20px; +# MAGIC border: 1px solid #DCE0E2; /* Databricks Gray - Lines */ +# MAGIC } +# MAGIC #send-button, #clear-button { +# MAGIC border-radius: 20px; +# MAGIC width: 100px; +# MAGIC } +# MAGIC #send-button { +# MAGIC background-color: #00A972; /* Databricks Green 600 */ +# MAGIC border-color: #00A972; +# MAGIC } +# MAGIC #clear-button { +# MAGIC background-color: #98102A; /* Databricks Maroon 600 */ +# MAGIC border-color: #98102A; +# MAGIC } +# MAGIC .input-group { +# MAGIC flex-wrap: nowrap; +# MAGIC } +# MAGIC ''' +# MAGIC self.app.index_string = self.app.index_string.replace( +# MAGIC '', +# MAGIC f'' +# MAGIC ) +# MAGIC +# MAGIC self.app.clientside_callback( +# MAGIC """ +# MAGIC function(children) { +# MAGIC var chatHistory = document.getElementById('chat-history'); +# MAGIC if(chatHistory) { +# MAGIC chatHistory.scrollTop = chatHistory.scrollHeight; +# MAGIC } +# MAGIC return ''; +# MAGIC } +# MAGIC """, +# MAGIC Output('dummy-output', 'children'), +# MAGIC Input('chat-history', 'children'), +# MAGIC prevent_initial_call=True +# MAGIC ) + +# COMMAND ---------- + +from databricks.sdk import WorkspaceClient 
+from databricks.sdk.service.apps import App, AppResource, AppResourceServingEndpoint, AppResourceServingEndpointServingEndpointPermission, AppDeployment +from databricks import agents + +model_name = f"{uc_catalog}.{schema}.{registered_model}" +deployment_info = agents.get_deployments(model_name)[0] + +print(f"Found agent deployment: {deployment_info.endpoint_name}") + +# COMMAND ---------- + +w = WorkspaceClient() + +serving_endpoint = AppResourceServingEndpoint(name=deployment_info.endpoint_name, + permission=AppResourceServingEndpointServingEndpointPermission.CAN_QUERY + ) + +agent_endpoint = AppResource(name="agent-endpoint", serving_endpoint=serving_endpoint) + +agent_app = App(name=app_name, + description="Your Databricks assistant", + default_source_code_path=os.getcwd(), + resources=[agent_endpoint]) +try: + app_details = w.apps.create_and_wait(app=agent_app) + print(app_details) +except Exception as e: + if "already exists" in str(e): + app_details = w.apps.get(app_name) + print(app_details) + else: + raise e + +# COMMAND ---------- + +deployment = AppDeployment( + source_code_path=os.getcwd() +) + +app_details = w.apps.deploy_and_wait(app_name=app_name, app_deployment=deployment) +print(app_details) \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/README.md.tmpl new file mode 100644 index 00000000..7c926bd6 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/README.md.tmpl @@ -0,0 +1,11 @@ +# Databricks Chatbot Application + +A modern, responsive chat interface that connects to Databricks AI serving endpoints, built with Dash and Python. 
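+
+Under the hood, the chat component calls the agent's serving endpoint through the Databricks SDK (mirroring `DatabricksChatbot._call_model_endpoint`). A minimal sketch of that call, assuming the `SERVING_ENDPOINT` environment variable has been injected via `app.yaml`:
+
+```python
+import os
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.serving import ChatMessage, ChatMessageRole
+
+# SERVING_ENDPOINT is set in app.yaml when the Databricks App is deployed.
+endpoint_name = os.getenv("SERVING_ENDPOINT")
+
+w = WorkspaceClient()
+response = w.serving_endpoints.query(
+    name=endpoint_name,
+    messages=[ChatMessage(role=ChatMessageRole.USER, content="What is MLflow?")],
+    max_tokens=128,
+)
+print(response.choices[0].message.content)
+```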
+ +## Features + +- Easy deployment as a Databricks App +- Real-time chat interactions with Databricks AI models +- Ability to collect user feedback on the responses + - Thumbs up / Thumbs down + - Additional feedback \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/requirements.txt.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/requirements.txt.tmpl new file mode 100644 index 00000000..3f3235a3 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/requirements.txt.tmpl @@ -0,0 +1,4 @@ +dash==2.18.1 +dash-bootstrap-components==1.6.0 +databricks-sdk==0.53.0 +python-dotenv==1.1.0 diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/utils.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/utils.py.tmpl new file mode 100644 index 00000000..97a66ed0 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/chat_interface_deployment/utils.py.tmpl @@ -0,0 +1,4 @@ +def list_endpoints(): + from databricks.sdk import WorkspaceClient + w = WorkspaceClient() + return [e.name for e in w.serving_endpoints.list()] \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/notebooks/ModelServing.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/notebooks/ModelServing.py.tmpl new file mode 100644 index 00000000..75e968d2 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/notebooks/ModelServing.py.tmpl @@ -0,0 +1,186 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 +# MAGIC # Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules +# MAGIC # To disable autoreload; run %autoreload 0 + +# COMMAND ---------- + +################################################################################## +# Model Serving +# +# Helper notebook to serve the model on an endpoint. This notebook is run +# after the agent development and evaluation notebooks as part of a multi-task job, in order to serve +# the latest registered version of the model on an endpoint. +# +# Parameters: +# * uc_catalog (required) - Name of the Unity Catalog +# * schema (required) - Name of the schema inside Unity Catalog +# * registered_model (required) - Name of the model registered in mlflow +# * model_alias (required) - Model alias to deploy +# * scale_to_zero (required) - Specify if the endpoint should scale to zero when not in use. +# * workload_size (required) - Specify the size of the compute scale-out, which corresponds to the number of requests this served +# model can process at the same time. This number should be roughly equal to QPS x model run time.
+# * bundle_root (required) - Root of the bundle +# +# Widgets: +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * Schema: Text widget to input the name of the database inside the Unity Catalog +# * Registered model name: Text widget to input the name of the model to register in mlflow +# * Model Alias: Text widget to input the model alias to deploy +# * Scale to zero: Whether the clusters should scale to zero (requiring more time at startup after inactivity) +# * Workload Size: Compute that matches estimated number of requests for the endpoint +# * Bundle root: Text widget to input the root of the bundle +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Add members that you want to grant access to for the review app to the user_list. +# 3. Run to deploy endpoint. +# +################################################################################## + +# COMMAND ---------- + +# List of input args needed to run the notebook as a job. +# Provide them via DB widgets or notebook arguments. + +# A Unity Catalog containing the model +dbutils.widgets.text( + "uc_catalog", + "ai_agent_stacks", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "ai_agent_ops", + label="Schema", +) +# Name of model registered in mlflow +dbutils.widgets.text( + "registered_model", + "agent_function_chatbot", + label="Registered model name", +) +# Model alias +dbutils.widgets.text( + "model_alias", + "agent_latest", + label="Model Alias", +) +# Scale to zero +dbutils.widgets.dropdown("scale_to_zero", "True", ["True", "False"], "Scale to zero") +# Workdload size +dbutils.widgets.dropdown("workload_size", "Small", ["Small", "Medium", "Large"], "Workload Size") + +# Bundle root +dbutils.widgets.text( + "bundle_root", + "/", + label="Root of bundle", +) + +# COMMAND ---------- + +dbutils.library.restartPython() + +# COMMAND ---------- + +uc_catalog = dbutils.widgets.get("uc_catalog") +schema = dbutils.widgets.get("schema") +registered_model = dbutils.widgets.get("registered_model") +model_alias = dbutils.widgets.get("model_alias") +scale_to_zero = bool(dbutils.widgets.get("scale_to_zero")) +workload_size = dbutils.widgets.get("workload_size") +bundle_root = dbutils.widgets.get("bundle_root") + +assert uc_catalog != "", "uc_catalog notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert registered_model != "", "registered_model notebook parameter must be specified" +assert model_alias != "", "model_alias notebook parameter must be specified" +assert scale_to_zero != "", "scale_to_zero notebook parameter must be specified" +assert workload_size != "", "workload_size notebook parameter must be specified" +assert bundle_root != "", "bundle_root notebook parameter must be specified" + +# Updating to bundle root +import sys + +root = dbutils.widgets.get("bundle_root") +sys.path.append(root) + +# COMMAND ---------- +# DBTITLE 1,Review Instructions +instructions_to_reviewer = f"""### Instructions for Testing the our Chatbot assistant + +Your inputs are invaluable for the development team. By providing detailed feedback and corrections, you help us fix issues and improve the overall quality of the application. We rely on your expertise to identify any gaps or areas needing enhancement. + +1. **Variety of Questions**: + - Please try a wide range of questions that you anticipate the end users of the application will ask. This helps us ensure the application can handle the expected queries effectively. 
+ +2. **Feedback on Answers**: + - After asking each question, use the feedback widgets provided to review the answer given by the application. + - If you think the answer is incorrect or could be improved, please use "Edit Answer" to correct it. Your corrections will enable our team to refine the application's accuracy. + +3. **Review of Returned Documents**: + - Carefully review each document that the system returns in response to your question. + - Use the thumbs up/down feature to indicate whether the document was relevant to the question asked. A thumbs up signifies relevance, while a thumbs down indicates the document was not useful. + +Thank you for your time and effort in testing our assistant. Your contributions are essential to delivering a high-quality product to our end users.""" + +# COMMAND ---------- +# DBTITLE 1,Create agent deployment + +from databricks import agents +from mlflow import MlflowClient + +client = MlflowClient() + +model_name = f"{uc_catalog}.{schema}.{registered_model}" +model_version = client.get_model_version_by_alias(model_name, model_alias).version + +if not agents.get_deployments(model_name): + deployment_info = agents.deploy(model_name=model_name, model_version=int(model_version), scale_to_zero=scale_to_zero, workload_size=workload_size) +else: + deployment_info = agents.get_deployments(model_name)[0] + print(f"Deployment {model_name} already exists. Deleting and redeploying...") + agents.delete_deployment(model_name=model_name, model_version=deployment_info.model_version) + deployment_info = agents.deploy(model_name=model_name, model_version=int(model_version), scale_to_zero=scale_to_zero, workload_size=workload_size) + + +# Add the user-facing instructions to the Review App +agents.set_review_instructions(model_name, instructions_to_reviewer) + +# COMMAND ---------- +# DBTITLE 1, Wait for model serving endpoint to be ready + +# DBTITLE 1,Test Endpoint +from agent_deployment.model_serving.serving import wait_for_model_serving_endpoint_to_be_ready +wait_for_model_serving_endpoint_to_be_ready(deployment_info.endpoint_name) + +# COMMAND ---------- + +# DBTITLE 1,Grant Permissions +#TODO grant your stakeholders permissions to use the Review App +# user_list = ["firstname.lastname@company.com"] + +# Set the permissions. 
+ +# agents.set_permissions(model_name=model_name, users=user_list, permission_level=agents.PermissionLevel.CAN_QUERY) + +# print(f"Share this URL with your stakeholders: {deployment_info.review_app_url}") + +# COMMAND ---------- +# DBTITLE 1,Test endpoint + +from mlflow.deployments import get_deploy_client + +client = get_deploy_client() +input_example = { + "messages": [{"role": "user", "content": "What is MLflow?"}], + "databricks_options": {"return_trace": True}, +} + +response = client.predict(endpoint=deployment_info.endpoint_name, inputs=input_example) + +print(response['messages'][-1]['content']) + diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/serving/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/serving/__init__.py.tmpl new file mode 100644 index 00000000..da1f0ae9 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/serving/__init__.py.tmpl @@ -0,0 +1,7 @@ +"""Model serving utilities for agent deployment.""" + +from .serving import wait_for_model_serving_endpoint_to_be_ready + +__all__ = [ + "wait_for_model_serving_endpoint_to_be_ready", +] \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/serving/serving.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/serving/serving.py.tmpl new file mode 100644 index 00000000..91e406fa --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_deployment/model_serving/serving/serving.py.tmpl @@ -0,0 +1,20 @@ +def wait_for_model_serving_endpoint_to_be_ready(ep_name): + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.serving import EndpointStateReady, EndpointStateConfigUpdate + import time + + # Wait for it to be ready + w = WorkspaceClient() + state = "" + for i in range(200): + state = w.serving_endpoints.get(ep_name).state + if state.config_update == EndpointStateConfigUpdate.IN_PROGRESS: + if i % 40 == 0: + print(f"Waiting for endpoint to deploy {ep_name}. 
Current state: {state}") + time.sleep(10) + elif state.ready == EndpointStateReady.READY: + print('endpoint ready.') + return + else: + break + raise Exception(f"Couldn't start the endpoint, timeout, please check your endpoint for more details: {state}") \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/README.md.tmpl new file mode 100644 index 00000000..e69de29b diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/__init__.py.tmpl new file mode 100644 index 00000000..e69de29b diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/README.md.tmpl new file mode 100644 index 00000000..6dcc3a63 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/README.md.tmpl @@ -0,0 +1,22 @@ +# Agent Development + +To better handle package conflicts and make the agent development code more portable, we have created a separate agent_requirements.txt. +Use this requirements file for the agent itself. + +## Logging the Agent + +To log the Agent in MLflow appropriately, you will need to write the entire agent pipeline using `%%writefile app.py`. +This will create an app.py file that is logged alongside the Agent artifacts. + +## Tool Calling + +The following tools are governed through [Unity Catalog functions](https://docs.databricks.com/aws/en/generative-ai/agent-framework/create-custom-tool): +* execute_python_code +* ask_ai_function +* summarization_function +* translate_function + +The retriever is defined in ./tools/ai_tools.py as retrieve_function. +Because it relies on packages that aren't in the standard Databricks ML Runtime, we are not able to register it as a UC Function. + +In the [UCFunctionToolkit](https://python.langchain.com/api_reference/community/tools/langchain_community.tools.databricks.tool.UCFunctionToolkit.html) tool list you can comment out any functions you do not want the Agent to consider in its tool-calling loop. diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/notebooks/Agent.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/notebooks/Agent.py.tmpl new file mode 100644 index 00000000..21b8120e --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/notebooks/Agent.py.tmpl @@ -0,0 +1,564 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 +# MAGIC # Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules +# MAGIC # To disable autoreload; run %autoreload 0 + +# COMMAND ---------- + +################################################################################### +# Agent Chain Creation +# +# This notebook shows an example of a RAG-based Agent that combines Unity Catalog function tools with a Vector Search retriever.
+# +# Parameters: +# * uc_catalog (required) - Name of the Unity Catalog +# * schema (required) - Name of the schema inside Unity Catalog +# * vector_search_endpoint (required) - Name of the vector search endpoint +# * vector_search_index (required) - Name of the vector search index +# * model_serving_endpopint (required) - Name of the model endpoint to serve +# * agent_model_endpoint (required) - Name and Identifier of the agent model endpoint +# * experiment (required) - Name of the experiment to register the run under +# * registered_model (required) - Name of the model to register in mlflow +# * max_words (required) - Maximum number of words to return in the response +# * model_alias (required) - Alias to give to newly registered model +# * bundle_root (required) - Root of the bundle +# +# Widgets: +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * Schema: Text widget to input the name of the database inside the Unity Catalog +# * Vector Search endpoint: Text widget to input the name of the vector search endpoint +# * Vector search index: Text widget to input the name of the vector search index +# * Agent model endppoint: Text widget to input the name of the agent model endpoint +# * Experiment: Text widget to input the name of the experiment to register the run under +# * Registered model name: Text widget to input the name of the model to register in mlflow +# * Max words: Text widget to input the maximum integer number of words to return in the response +# * Model Alias: Text widget to input the alias of the model to register in mlflow +# * Bundle root: Text widget to input the root of the bundle +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Run the pipeline to create and register an agent with tool calling. 
+# +################################################################################## + +# COMMAND ---------- + +# List of input args needed to run this notebook as a job +# Provide them via DB widgets or notebook arguments + +# A Unity Catalog containing the preprocessed data +dbutils.widgets.text( + "uc_catalog", + "ai_agent_stacks", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "ai_agent_ops", + label="Schema", +) +# Name of vector search endpoint containing the preprocessed index +dbutils.widgets.text( + "vector_search_endpoint", + "ai_agent_endpoint", + label="Vector Search endpoint", +) +# Name of vector search index containing the preprocessed index +dbutils.widgets.text( + "vector_search_index", + "databricks_documentation_vs_index", + label="Vector Search index", +) +# Foundational model to use +dbutils.widgets.text( + "agent_model_endpoint", + "databricks-meta-llama-3-3-70b-instruct", + label="Agent model name", +) +# Name of experiment to register under in mlflow +dbutils.widgets.text( + "experiment", + "agent_function_chatbot", + label="Experiment name", +) +# Name of model to register in mlflow +dbutils.widgets.text( + "registered_model", + "agent_function_chatbot", + label="Registered model name", +) +# Max words for summarization +dbutils.widgets.text( + "max_words", + "20", + label="Max Words", +) +# Model alias +dbutils.widgets.text( + "model_alias", + "agent_latest", + label="Model Alias", +) + +# Bundle root +dbutils.widgets.text( + "bundle_root", + "/", + label="Root of bundle", +) + +# COMMAND ---------- + +uc_catalog = dbutils.widgets.get("uc_catalog") +schema = dbutils.widgets.get("schema") +vector_search_endpoint = dbutils.widgets.get("vector_search_endpoint") +vector_search_index = dbutils.widgets.get("vector_search_index") +agent_model_endpoint = dbutils.widgets.get("agent_model_endpoint") +experiment = dbutils.widgets.get("experiment") +registered_model = dbutils.widgets.get("registered_model") +max_words = dbutils.widgets.get("max_words") +model_alias = dbutils.widgets.get("model_alias") +bundle_root = dbutils.widgets.get("bundle_root") + +assert uc_catalog != "", "uc_catalog notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert vector_search_endpoint != "", "vector_search_endpoint notebook parameter must be specified" +assert vector_search_index != "", "vector_search_index notebook parameter must be specified" +assert agent_model_endpoint != "", "agent_model_endpoint notebook parameter must be specified" +assert experiment != "", "experiment notebook parameter must be specified" +assert registered_model != "", "registered_model notebook parameter must be specified" +assert max_words != "", "max_words notebook parameter must be specified" +assert model_alias != "", "model_alias notebook parameter must be specified" +assert bundle_root != "", "bundle_root notebook parameter must be specified" + +# Updating to bundle root +import sys +sys.path.append(bundle_root) + +# COMMAND ---------- + +# DBTITLE 1,Create a DatabricksFunctionClient and set as default +from unitycatalog.ai.core.base import set_uc_function_client +from unitycatalog.ai.core.databricks import DatabricksFunctionClient + +client = DatabricksFunctionClient() + +# sets the default uc function client +set_uc_function_client(client) + + +# COMMAND ---------- + +# DBTITLE 1,Function: execute_python_code +from agent_development.agent.tools import execute_python_code + +function_info = 
client.create_python_function( + func=execute_python_code, catalog=uc_catalog, schema=schema, replace=True +) +python_execution_function_name = function_info.full_name + +# test execution +client.execute_function(python_execution_function_name, {"code": "print(1+1)"}) + +# COMMAND ---------- + +# DBTITLE 1,Function: ai_function_name_sql +from agent_development.agent.tools import ask_ai_function + +ask_ai_function_name = f"{uc_catalog}.{schema}.ask_ai" + +client.create_function(sql_function_body = ask_ai_function.format(ask_ai_function_name = ask_ai_function_name)) +result = client.execute_function(ask_ai_function_name, {"question": "What is MLflow?"}) +result.value + +# COMMAND ---------- + +# DBTITLE 1,Function: summarization_function +from agent_development.agent.tools import summarization_function + +summarization_function_name = f"{uc_catalog}.{schema}.summarize" + +client.create_function(sql_function_body = summarization_function.format(summarization_function_name = summarization_function_name)) +# test execution +client.execute_function(summarization_function_name, {"text": result.value, "max_words": int(max_words)}) + +# COMMAND ---------- + +# DBTITLE 1,Function: translate_function +from agent_development.agent.tools import translate_function + +translate_function_name = f"{uc_catalog}.{schema}.translate" + +client.create_function(sql_function_body = translate_function.format(translate_function_name = translate_function_name)) +# test execution +client.execute_function(translate_function_name, {"content": "What is MLflow?", "language": "es"}) + +# COMMAND ---------- + +# DBTITLE 1,Define UC toolkit +from unitycatalog.ai.langchain.toolkit import UCFunctionToolkit + +# Add tools here +toolkit = UCFunctionToolkit( + function_names=[ + python_execution_function_name, + # ask_ai_function_name, # commenting out to showcase the retriever + summarization_function_name, + translate_function_name, + ] +) + +uc_tools = toolkit.tools +uc_tools + +# COMMAND ---------- + +# DBTITLE 1,Import retriever_function +from ai_tools import retrieve_function + +os.environ["UC_CATALOG"] = uc_catalog # Set these before function execution +os.environ["SCHEMA"] = schema +os.environ["VECTOR_SEARCH_INDEX"] = vector_search_index + +# retrieve_function("what is mlflow?") # Remove @tool from the retrieve_function in ai_tools.py to test + +# COMMAND ---------- + +# DBTITLE 1,Initialize MLflow +import os +import mlflow + +mlflow.langchain.autolog() + +# COMMAND ---------- + +# DBTITLE 1,Use the tools in Langgraph +from typing import Any, Generator, Optional, Sequence, Union, Literal +import mlflow +from databricks_langchain import ( + ChatDatabricks, + UCFunctionToolkit, + VectorSearchRetrieverTool, +) +from langchain_core.language_models import LanguageModelLike +from langchain_core.runnables import RunnableConfig, RunnableLambda +from langchain_core.tools import BaseTool +from langgraph.graph import START, END, StateGraph +from langgraph.prebuilt.tool_node import ToolNode +from mlflow.langchain.chat_agent_langgraph import ChatAgentState, ChatAgentToolNode +from mlflow.pyfunc import ChatAgent +from mlflow.types.agent import ( + ChatAgentChunk, + ChatAgentMessage, + ChatAgentResponse, + ChatContext, +) + +tools = uc_tools + [retrieve_function] + +# Example for Databricks foundation model endpoints +model = ChatDatabricks(endpoint=f"{agent_model_endpoint}") +system_prompt = "You are a Databricks expert. 
" + +def create_tool_calling_agent( + model: LanguageModelLike, + tools: Union[ToolNode, Sequence[BaseTool]], + system_prompt: Optional[str]=None +): + model = model.bind_tools(tools) + + # Define the function that determines whether to continue or not + def should_continue(state: ChatAgentState) -> Literal["tools", END]: + messages = state["messages"] + last_message = messages[-1] + # If the LLM makes a tool call, then we route to the "tools" node + if last_message.get("tool_calls"): + return "tools" + # Otherwise, we stop (reply to the user) + return END + + preprocessor = RunnableLambda( + lambda state: [{"role": "system", "content": system_prompt}] + + state["messages"] + ) + model_runnable = preprocessor | model + + # Define the function that calls the model + def call_model(state: ChatAgentState, config: RunnableConfig): + # Loop to make sure the tool call is executed correctly + failing = True + retry = 10 + while failing and retry>=0: + try: + response = model_runnable.invoke(state, config) + failing = False + except: + retry -= 1 + # We return a list, because this will get added to the existing list + return {"messages": [response]} + + # Define a new graph + workflow = StateGraph(ChatAgentState) + + # Define the two nodes we will cycle between + tool_node = ChatAgentToolNode(tools) + workflow.add_node("agent", RunnableLambda(call_model)) + workflow.add_node("tools", tool_node) + + # Set the entrypoint as `agent` + # This means that this node is the first one called + workflow.add_edge(START, "agent") + + # We now add a conditional edge + workflow.add_conditional_edges( + # First, we define the start node. We use `agent`. + # This means these are the edges taken after the `agent` node is called. + "agent", + # Next, we pass in the function that will determine which node is called next. + should_continue, + ) + + # We now add a normal edge from `tools` to `agent`. + # This means that after `tools` is called, `agent` node is called next. + workflow.add_edge("tools", "agent") + + return workflow.compile() + +app = create_tool_calling_agent(model, tools, system_prompt) + +# COMMAND ---------- + +final_state = app.invoke( + { + "messages": [ + { + "role": "user", + "content": "Retrieve the documentation for MLflow. Keep the response concise and reply in Spanish. Try using as many tools as possible", + } + ] + }, +) +response = final_state["messages"][-1].get('content') + +# COMMAND ---------- + +final_state = app.invoke( + { + "messages": [ + { + "role": "user", + "content": f"Remember to always try using tools. Can you convert the following explanation to English? 
{response}", + } + ] + }, +) +final_state["messages"][-1].get('content') + +# COMMAND ---------- + +final_state = app.invoke( + {"messages": [{"role": "user", "content": "What is MLflow?"}]}, +) +final_state["messages"][-1].get('content') + +# COMMAND ---------- + +# DBTITLE 1,Log the model using MLflow +# MAGIC %%writefile app.py +# MAGIC from typing import Any, Generator, Optional, Sequence, Union, Literal +# MAGIC import mlflow +# MAGIC from databricks_langchain import ( +# MAGIC ChatDatabricks, +# MAGIC UCFunctionToolkit, +# MAGIC VectorSearchRetrieverTool, +# MAGIC ) +# MAGIC from langchain_core.language_models import LanguageModelLike +# MAGIC from langchain_core.runnables import RunnableConfig, RunnableLambda +# MAGIC from langchain_core.tools import BaseTool, tool +# MAGIC from langgraph.graph import START, END, StateGraph +# MAGIC from langgraph.prebuilt.tool_node import ToolNode +# MAGIC from mlflow.langchain.chat_agent_langgraph import ChatAgentState, ChatAgentToolNode +# MAGIC from mlflow.pyfunc import ChatAgent +# MAGIC from mlflow.types.agent import ( +# MAGIC ChatAgentChunk, +# MAGIC ChatAgentMessage, +# MAGIC ChatAgentResponse, +# MAGIC ChatContext, +# MAGIC ) +# MAGIC +# MAGIC uc_catalog = "{{.input_catalog_name}}" +# MAGIC schema = "{{.input_schema_name}}" +# MAGIC +# MAGIC python_execution_function_name = f"{uc_catalog}.{schema}.execute_python_code" +# MAGIC ask_ai_function_name = f"{uc_catalog}.{schema}.ask_ai" +# MAGIC summarization_function_name = f"{uc_catalog}.{schema}.summarize" +# MAGIC translate_function_name = f"{uc_catalog}.{schema}.translate" +# MAGIC +# MAGIC @tool +# MAGIC def retrieve_function(query: str) -> str: +# MAGIC """Retrieve from Databricks Vector Search using the query.""" +# MAGIC +# MAGIC index = f"{uc_catalog}.{schema}.databricks_documentation_vs_index" +# MAGIC +# MAGIC vs_tool = VectorSearchRetrieverTool( +# MAGIC index_name=index, +# MAGIC tool_name="vector_search_retriever", +# MAGIC tool_description="Retrieves information from Databricks Vector Search.", +# MAGIC embedding_model_name="databricks-bge-large-en", +# MAGIC num_results=1, +# MAGIC columns=["url", "content"], +# MAGIC query_type="ANN" +# MAGIC ) +# MAGIC +# MAGIC response = vs_tool.invoke(query) +# MAGIC return f"{response[0].metadata['url']} \n{response[0].page_content}" +# MAGIC +# MAGIC toolkit = UCFunctionToolkit( +# MAGIC function_names=[ +# MAGIC python_execution_function_name, +# MAGIC # ask_ai_function_name, # commenting out to showcase retriever +# MAGIC summarization_function_name, +# MAGIC translate_function_name, +# MAGIC ] +# MAGIC ) +# MAGIC uc_tools = toolkit.tools +# MAGIC tools = uc_tools + [retrieve_function] +# MAGIC +# MAGIC # Example for Databricks foundation model endpoints +# MAGIC model = ChatDatabricks(endpoint="databricks-meta-llama-3-3-70b-instruct") +# MAGIC system_prompt = "You are a Databricks expert. 
" +# MAGIC +# MAGIC def create_tool_calling_agent( +# MAGIC model: LanguageModelLike, +# MAGIC tools: Union[ToolNode, Sequence[BaseTool]], +# MAGIC system_prompt: Optional[str]=None +# MAGIC ): +# MAGIC model = model.bind_tools(tools) +# MAGIC +# MAGIC def should_continue(state: ChatAgentState) -> Literal["tools", END]: +# MAGIC messages = state["messages"] +# MAGIC last_message = messages[-1] +# MAGIC if last_message.get("tool_calls"): +# MAGIC return "tools" +# MAGIC return END +# MAGIC +# MAGIC preprocessor = RunnableLambda(lambda state: [{"role": "system", "content": system_prompt}] + state["messages"]) +# MAGIC model_runnable = preprocessor | model +# MAGIC +# MAGIC def call_model(state: ChatAgentState, config: RunnableConfig): +# MAGIC failing = True +# MAGIC retry = 10 +# MAGIC while failing and retry>=0: +# MAGIC try: +# MAGIC response = model_runnable.invoke(state, config) +# MAGIC failing = False +# MAGIC except: +# MAGIC retry -= 1 +# MAGIC return {"messages": [response]} +# MAGIC +# MAGIC workflow = StateGraph(ChatAgentState) +# MAGIC +# MAGIC tool_node = ChatAgentToolNode(tools) +# MAGIC workflow.add_node("agent", RunnableLambda(call_model)) +# MAGIC workflow.add_node("tools", tool_node) +# MAGIC workflow.add_edge(START, "agent") +# MAGIC workflow.add_conditional_edges("agent", should_continue) +# MAGIC workflow.add_edge("tools", "agent") +# MAGIC return workflow.compile() +# MAGIC +# MAGIC class LangGraphChatAgent(ChatAgent): +# MAGIC def __init__(self, agent): +# MAGIC self.agent = agent +# MAGIC +# MAGIC def predict( +# MAGIC self, +# MAGIC messages: list[ChatAgentMessage], +# MAGIC context: Optional[ChatContext] = None, +# MAGIC custom_inputs: Optional[dict[str, Any]] = None, +# MAGIC ) -> ChatAgentResponse: +# MAGIC request = {"messages": self._convert_messages_to_dict(messages)} +# MAGIC +# MAGIC messages = [] +# MAGIC for event in self.agent.stream(request, stream_mode="updates"): +# MAGIC for node_data in event.values(): +# MAGIC messages.extend( +# MAGIC ChatAgentMessage(**msg) for msg in node_data.get("messages", []) +# MAGIC ) +# MAGIC return ChatAgentResponse(messages=messages) +# MAGIC +# MAGIC def predict_stream( +# MAGIC self, +# MAGIC messages: list[ChatAgentMessage], +# MAGIC context: Optional[ChatContext] = None, +# MAGIC custom_inputs: Optional[dict[str, Any]] = None, +# MAGIC ) -> Generator[ChatAgentChunk, None, None]: +# MAGIC request = {"messages": self._convert_messages_to_dict(messages)} +# MAGIC for event in self.agent.stream(request, stream_mode="updates"): +# MAGIC for node_data in event.values(): +# MAGIC yield from ( +# MAGIC ChatAgentChunk(**{"delta": msg}) for msg in node_data["messages"] +# MAGIC ) +# MAGIC +# MAGIC # Create the agent object, and specify it as the agent object to use when +# MAGIC # loading the agent back for inference via mlflow.models.set_model() +# MAGIC mlflow.langchain.autolog() +# MAGIC agent = create_tool_calling_agent(model, tools, system_prompt) +# MAGIC AGENT = LangGraphChatAgent(agent) +# MAGIC mlflow.models.set_model(AGENT) + +# COMMAND ---------- + +import mlflow +from mlflow.models.resources import DatabricksFunction, DatabricksServingEndpoint, DatabricksVectorSearchIndex +from pkg_resources import get_distribution + +mlflow.set_experiment(experiment) + +resources = [ + DatabricksServingEndpoint(endpoint_name=agent_model_endpoint), + DatabricksFunction(f"{uc_catalog}.{schema}.execute_python_code"), + DatabricksFunction(f"{uc_catalog}.{schema}.ask_ai"), + DatabricksFunction(f"{uc_catalog}.{schema}.summarize"), + 
DatabricksFunction(f"{uc_catalog}.{schema}.translate"), + DatabricksVectorSearchIndex(index_name=f"{uc_catalog}.{schema}.{vector_search_index}") +] + +with mlflow.start_run(): + model_info = mlflow.pyfunc.log_model( + python_model="../notebooks/app.py", # Pass the path to the saved model file + name="model", + resources=resources, + pip_requirements=[ + f"databricks-connect=={get_distribution('databricks-connect').version}", + f"unitycatalog-langchain[databricks]=={get_distribution('unitycatalog-langchain[databricks]').version}", + f"databricks-vectorsearch=={get_distribution('databricks-vectorsearch').version}", + f"databricks-langchain=={get_distribution('databricks-langchain').version}", + f"langgraph=={get_distribution('langgraph').version}", + f"mlflow=={get_distribution('mlflow').version}", + ], + registered_model_name=f"{uc_catalog}.{schema}.{registered_model}" # Replace with your own model name + ) + +# COMMAND ---------- + +# DBTITLE 1,Set Alias +from mlflow import MlflowClient + +# Initialize MLflow client +client = MlflowClient() + +# Set an alias for new version of the registered model to retrieve it for model serving +client.set_registered_model_alias(f"{uc_catalog}.{schema}.{registered_model}", model_alias, model_info.registered_model_version) + + +# COMMAND ---------- + +# DBTITLE 1,Validate the model locally prior to serving +from mlflow.models import convert_input_example_to_serving_input, validate_serving_input + +serving_input = convert_input_example_to_serving_input( + {"messages": [{"role": "user", "content": "What is MLflow?"}]} +) +validate_serving_input(model_info.model_uri, serving_input=serving_input) \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/tools/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/tools/__init__.py.tmpl new file mode 100644 index 00000000..e69de29b diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/tools/ai_tools.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/tools/ai_tools.py.tmpl new file mode 100644 index 00000000..28fa32c4 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent/tools/ai_tools.py.tmpl @@ -0,0 +1,74 @@ +# You can find additional AI built-in functions starting at https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_classify + +# execute_python_code +def execute_python_code(code: str) -> str: + """ + Executes the given python code and returns its stdout. + Remember the code should print the final result to stdout. + + Args: + code: Python code to execute. Remember to print the final result to stdout. + """ + import sys + from io import StringIO + + stdout = StringIO() + sys.stdout = stdout + try: + exec(code) + return stdout.getvalue() + except Exception as e: + if "Spark" in str(e): + return f"Python code execution failed: {e}. Databricks specific code is not allowed." + if "pyspark" in str(e): + return f"Python code execution failed: {e}. Databricks specific code is not allowed." + if "dbutils" in str(e): + return f"Python code execution failed: {e}. Databricks specific code is not allowed." + else: + return f"Python code execution failed: {e}. Use simple code + try again." 
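The stdout-capture pattern in `execute_python_code` reassigns `sys.stdout` and never restores it, so a failure inside `exec` can leave the calling notebook's stdout redirected. A minimal sketch of the same capture idea using `contextlib.redirect_stdout`, which restores stdout automatically, is shown below; it is an illustration of the pattern, not part of the template:

```python
# Sketch only: equivalent capture behaviour to execute_python_code above,
# but redirect_stdout restores sys.stdout even if exec() raises.
from contextlib import redirect_stdout
from io import StringIO


def run_and_capture(code: str) -> str:
    """Execute the given Python code and return whatever it printed to stdout."""
    buffer = StringIO()
    try:
        with redirect_stdout(buffer):
            exec(code)
        return buffer.getvalue()
    except Exception as e:  # broad on purpose, mirroring the tool above
        return f"Python code execution failed: {e}"


print(run_and_capture("print(1 + 1)"))  # -> "2"
```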
+ +# AI function name +ask_ai_function = """CREATE OR REPLACE FUNCTION {ask_ai_function_name}(question STRING COMMENT 'question to ask') +RETURNS STRING +COMMENT 'answer the question using chosen model' +RETURN SELECT ai_gen(question) +""" + +# Summarization function +summarization_function = """CREATE OR REPLACE FUNCTION {summarization_function_name}(text STRING COMMENT 'content to parse', max_words INT COMMENT 'max number of words in the response, must be non-negative integer, if set to 0 then no limit') +RETURNS STRING +COMMENT 'summarize the content and limit response to max_words' +RETURN SELECT ai_summarize(text, max_words) +""" + +# Translate function +translate_function = """CREATE OR REPLACE FUNCTION {translate_function_name}(content STRING COMMENT 'content to translate', language STRING COMMENT 'target language') +RETURNS STRING +COMMENT 'translate the content to target language, currently only english <-> spanish translation is supported' +RETURN SELECT ai_translate(content, language) +""" + +# Retrieve function +from databricks_langchain import VectorSearchRetrieverTool +from langchain_core.tools import tool +import os + +@tool +def retrieve_function(query: str) -> str: + """Retrieve from Databricks Vector Search using the query.""" + + index = f"{os.getenv('UC_CATALOG')}.{os.getenv('SCHEMA')}.{os.getenv('VECTOR_SEARCH_INDEX')}" + + # Define the Vector Search Retriever Tool + vs_tool = VectorSearchRetrieverTool( + index_name=index, # Replace with your index name + tool_name="vector_search_retriever", + tool_description="Retrieves information from Databricks Vector Search.", + embedding_model_name="databricks-bge-large-en", # Embedding model + num_results=1, # Number of results to return + columns=["url", "content"], # Columns to include in search results + query_type="ANN" # Query type (ANN or HYBRID) + ) + + response = vs_tool.invoke(query) + return f"{response[0].metadata['url']} \n{response[0].page_content}" \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/evaluation/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/evaluation/__init__.py.tmpl new file mode 100644 index 00000000..f8545c21 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/evaluation/__init__.py.tmpl @@ -0,0 +1,9 @@ +"""Evaluation module for agent evaluation.""" + +from .evaluation import ( + get_reference_documentation +) + +__all__ = [ + "get_reference_documentation", +] diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/evaluation/evaluation.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/evaluation/evaluation.py.tmpl new file mode 100644 index 00000000..9b69cb08 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/evaluation/evaluation.py.tmpl @@ -0,0 +1,10 @@ +def get_reference_documentation(catalog, schema, table, spark): + import pandas as pd + from pyspark.sql.functions import col, struct + ref_docs = (spark.createDataFrame(pd.read_parquet('https://notebooks.databricks.com/demos/dbdemos-dataset/llm/databricks-documentation/databricks_doc_eval_set.parquet')) + 
.withColumnRenamed('request', 'inputs') + .withColumnRenamed('expected_response', 'expectations') + .withColumn('inputs', struct(col('inputs').alias('question'))) + .withColumn('expectations', struct(col('expectations').alias('expected_response')))) + + return ref_docs \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/notebooks/AgentEvaluation.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/notebooks/AgentEvaluation.py.tmpl new file mode 100644 index 00000000..32c9bc79 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_evaluation/notebooks/AgentEvaluation.py.tmpl @@ -0,0 +1,167 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 +# MAGIC # Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules +# MAGIC # To disable autoreload; run %autoreload 0 + +# COMMAND ---------- + +################################################################################## +# Agent Evaluation +# +# Notebook that downloads an evaluation dataset and evaluates the model using +# llm-as-a-judge with the Databricks agent framework. +# +# Parameters: +# * uc_catalog (required) - Name of the Unity Catalog +# * schema (required) - Name of the schema inside Unity Catalog +# * eval_table (required) - Name of the table containing the evaluation dataset +# * experiment (required) - Name of the experiment to register the run under +# * registered_model (required) - Name of the model registered in mlflow +# * model_alias (required) - Model alias to deploy +# * bundle_root (required) - Root of the bundle +# +# Widgets: +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * Schema: Text widget to input the name of the database inside the Unity Catalog +# * Evaluation Table: Text widget to input the name of the table containing the evaluation dataset +# * Experiment: Text widget to input the name of the experiment to register the run under +# * Registered model name: Text widget to input the name of the model to register in mlflow +# * Model Alias: Text widget to input the model alias to deploy +# * Bundle root: Text widget to input the root of the bundle +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Run to evaluate your agent. +# +################################################################################## + +# COMMAND ---------- + +# List of input args needed to run the notebook as a job. +# Provide them via DB widgets or notebook arguments. 
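For orientation, each record produced by `get_reference_documentation` above (and later merged into the evaluation dataset) is shaped roughly as in the sketch below; the actual questions and answers come from the downloaded dbdemos evaluation parquet, so the values here are placeholders:

```python
# Hypothetical record, mirroring the inputs/expectations structs built in evaluation.py.
# The evaluation harness unpacks the inputs struct into keyword arguments for the
# predict function (here, question), and the scorers compare the model output
# against expectations["expected_response"].
example_record = {
    "inputs": {"question": "How do I create a Delta table?"},
    "expectations": {
        "expected_response": "Use CREATE TABLE ... USING DELTA, or write a DataFrame with format('delta')."
    },
}
```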
+ +# A Unity Catalog containing the model +dbutils.widgets.text( + "uc_catalog", + "ai_agent_stacks", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "ai_agent_ops", + label="Schema", +) +# Name of evaluation table +dbutils.widgets.text( + "eval_table", + "databricks_documentation_eval", + label="Evaluation dataset", +) +# Name of experiment to register under in mlflow +dbutils.widgets.text( + "experiment", + "agent_function_chatbot", + label="Experiment name", +) +# Name of model registered in mlflow +dbutils.widgets.text( + "registered_model", + "agent_function_chatbot", + label="Registered model name", +) +# Model alias +dbutils.widgets.text( + "model_alias", + "agent_latest", + label="Model Alias", +) +# Bundle root +dbutils.widgets.text( + "bundle_root", + "/", + label="Root of bundle", +) + +# COMMAND ---------- + +uc_catalog = dbutils.widgets.get("uc_catalog") +schema = dbutils.widgets.get("schema") +eval_table = dbutils.widgets.get("eval_table") +experiment = dbutils.widgets.get("experiment") +registered_model = dbutils.widgets.get("registered_model") +model_alias = dbutils.widgets.get("model_alias") +bundle_root = dbutils.widgets.get("bundle_root") + +assert uc_catalog != "", "uc_catalog notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert eval_table != "", "eval_table notebook parameter must be specified" +assert experiment != "", "experiment notebook parameter must be specified" +assert registered_model != "", "registered_model notebook parameter must be specified" +assert model_alias != "", "model_alias notebook parameter must be specified" +assert bundle_root != "", "bundle_root notebook parameter must be specified" + +# COMMAND ---------- + +# DBTITLE 1,Create Evaluation Dataset +import mlflow.genai.datasets + +try: + eval_dataset = mlflow.genai.datasets.create_dataset( + uc_table_name=f"{uc_catalog}.{schema}.{eval_table}", + ) +except: + # Eval table already exists + eval_dataset = mlflow.genai.datasets.get_dataset( + uc_table_name=f"{uc_catalog}.{schema}.{eval_table}", + ) + +print(f"Evaluation dataset: {uc_catalog}.{schema}.{eval_table}") + +# COMMAND ---------- + +# DBTITLE 1,Get Reference Documentation +from agent_development.agent_evaluation.evaluation import get_reference_documentation + +reference_docs = get_reference_documentation(uc_catalog, schema, eval_table, spark) + +display(reference_docs) + +# COMMAND ---------- + +# DBTITLE 1,Merge Reference Docs to Eval Dataset +eval_dataset.merge_records(reference_docs.limit(100)) + +# Preview the dataset +df = eval_dataset.to_df() +print(f"\nDataset preview:") +print(f"Total records: {len(df)}") +print("\nSample record:") +sample = df.iloc[0] +print(f"Inputs: {sample['inputs']}") + +# COMMAND ---------- + +# DBTITLE 1,Run Evaluation +import mlflow +from mlflow.genai.scorers import scorer +from mlflow.genai.scorers import RetrievalRelevance, RetrievalGroundedness +import re + +# Workaround for serverless compatibility +mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc" + +model = mlflow.pyfunc.load_model(f"models:/{uc_catalog}.{schema}.{registered_model}@{model_alias}") +def evaluate_model(question): + return model.predict({"messages": [{"role": "user", "content": question}]}) + +mlflow.set_experiment(experiment) + +with mlflow.start_run(): + # Evaluate the logged model + eval_results = mlflow.genai.evaluate( + data=eval_dataset, + predict_fn=evaluate_model, + 
scorers=[RetrievalRelevance(), RetrievalGroundedness()], + ) \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_requirements.txt.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_requirements.txt.tmpl new file mode 100644 index 00000000..abf7087d --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/agent_development/agent_requirements.txt.tmpl @@ -0,0 +1,6 @@ +unitycatalog-langchain[databricks]==0.2.0 +databricks-vectorsearch==0.56 +databricks-langchain==0.5.1 +langgraph==0.5.0 +mlflow==3.1.1 +databricks-agents==1.1.0 \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/README.md.tmpl new file mode 100644 index 00000000..e69de29b diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/README.md.tmpl new file mode 100644 index 00000000..51aa9e95 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/README.md.tmpl @@ -0,0 +1 @@ +# Data Ingestion \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/__init__.py.tmpl new file mode 100644 index 00000000..e69de29b diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/ingestion/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/ingestion/__init__.py.tmpl new file mode 100644 index 00000000..e69de29b diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/ingestion/fetch_data.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/ingestion/fetch_data.py.tmpl new file mode 100644 index 00000000..fa9bad86 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/ingestion/fetch_data.py.tmpl @@ -0,0 +1,85 @@ +""" +This sample module contains data ingestion logic for ingesting data from a URL and parsing the HTML content. +You should adapt the code based ont the HTML structure of your own data. The function returns a DataFrame with the parsed content. 
+""" + + +from pyspark.sql.types import StringType +from pyspark.sql.functions import col, udf, length, pandas_udf + + +from bs4 import BeautifulSoup +import xml.etree.ElementTree as ET +from concurrent.futures import ThreadPoolExecutor +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +import pandas as pd + + +retries = Retry( + total=3, + backoff_factor=3, + status_forcelist=[429], +) + + +def fetch_data_from_url(spark, data_source_url, max_documents=None): + # Fetch the XML content from sitemap + response = requests.get(data_source_url) + root = ET.fromstring(response.content) + + # Find all 'loc' elements (URLs) in the XML + urls = [loc.text for loc in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")] + if max_documents: + urls = urls[:max_documents] + + # Create DataFrame from URLs + df_urls = spark.createDataFrame(urls, StringType()).toDF("url").repartition(10) + + # Pandas UDF to fetch HTML content for a batch of URLs + @pandas_udf("string") + def fetch_html_udf(urls: pd.Series) -> pd.Series: + adapter = HTTPAdapter(max_retries=retries) + http = requests.Session() + http.mount("http://", adapter) + http.mount("https://", adapter) + def fetch_html(url): + try: + response = http.get(url) + if response.status_code == 200: + return response.content + except requests.RequestException: + return None + return None + + with ThreadPoolExecutor(max_workers=200) as executor: + results = list(executor.map(fetch_html, urls)) + return pd.Series(results) + + # Pandas UDF to process HTML content and extract text + @pandas_udf("string") + def download_web_page_udf(html_contents: pd.Series) -> pd.Series: + def extract_text(html_content): + if html_content: + soup = BeautifulSoup(html_content, "html.parser") + article_div = soup.find("div", class_="theme-doc-markdown markdown") + if article_div: + return str(article_div).strip() + return None + + return html_contents.apply(extract_text) + + # Apply UDFs to DataFrame + df_with_html = df_urls.withColumn("html_content", fetch_html_udf("url")) + final_df = df_with_html.withColumn("text", download_web_page_udf("html_content")) + + # Select and filter non-null results + final_df = final_df.select("url", "text").filter("text IS NOT NULL") + if final_df.isEmpty(): + raise Exception("""Dataframe is empty, couldn't download Databricks documentation. + This is most likely caused by article_div = soup.find("div", class_="theme-doc-markdown markdown") in download_web_page_udf. + Please check the html of the documentation page you are trying to download and chance the filter accordingly. 
+ """) + + return final_df \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py.tmpl new file mode 100644 index 00000000..fecc04be --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py.tmpl @@ -0,0 +1,119 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 +# MAGIC # Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules +# MAGIC # To disable autoreload; run %autoreload 0 + +# COMMAND ---------- + +################################################################################### +# Data Ingestion Pipeline +# +# This pipeline is designed to process raw documentation data from a specified data source URL. +# The data is stored in a Unity Catalog within a specified database for later processing. +# +# Parameters: +# * uc_catalog (required) - Name of the Unity Catalog containing the input data +# * schema (required) - Name of the schema inside the Unity Catalog +# * raw_data_table (required) - Name of the raw data table inside the database of the Unity Catalog +# * data_source_url (required) - URL of the data source. Default is "https://docs.databricks.com/en/doc-sitemap.xml" +# * bundle_root (required) - Root of the bundle +# +# Widgets: +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * Schema: Text widget to input the name of the database inside the Unity Catalog +# * Raw data table: Text widget to input the name of the raw data table inside the database of the Unity Catalog +# * Data Source URL: Text widget to input the URL of the data source +# * Root of bundle: Text widget to input the root of the bundle +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Run the pipeline to collect and store the raw documentation data. 
+# +################################################################################## + +# COMMAND ---------- + +# DBTITLE 1,Widget creation +# List of input args needed to run this notebook as a job +# Provide them via DB widgets or notebook arguments in your DAB resources + +# A Unity Catalog containing the input data +dbutils.widgets.text( + "uc_catalog", + "ai_agent_stacks", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "ai_agent_ops", + label="Schema", +) +# Name of raw data table +dbutils.widgets.text( + "raw_data_table", + "raw_documentation", + label="Raw data table", +) + +# Data source url +dbutils.widgets.text( + "data_source_url", + "https://docs.databricks.com/en/doc-sitemap.xml", + label="Data Source URL", +) + +# Bundle root +dbutils.widgets.text( + "bundle_root", + "/", + label="Root of bundle", +) + +# COMMAND ---------- + +# DBTITLE 1,Define input and output variables +uc_catalog = dbutils.widgets.get("uc_catalog") +schema = dbutils.widgets.get("schema") +raw_data_table = dbutils.widgets.get("raw_data_table") +data_source_url = dbutils.widgets.get("data_source_url") +bundle_root = dbutils.widgets.get("bundle_root") + +assert uc_catalog != "", "uc_catalog notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert raw_data_table != "", "raw_data_table notebook parameter must be specified" +assert data_source_url != "", "data_source_url notebook parameter must be specified" +assert bundle_root != "", "bundle_root notebook parameter must be specified" + +# Updating to bundle root +import sys + +root = dbutils.widgets.get("bundle_root") +sys.path.append(root) + +# COMMAND ---------- + +# DBTITLE 1,Use the catalog and database specified in the notebook parameters +spark.sql(f"""CREATE SCHEMA IF NOT EXISTS `{uc_catalog}`.`{schema}`""") + +spark.sql(f"""USE `{uc_catalog}`.`{schema}`""") + +# COMMAND ---------- + +# DBTITLE 1,Download and store data to UC +from data_preparation.data_ingestion.ingestion.fetch_data import fetch_data_from_url + +if not spark.catalog.tableExists(f"{raw_data_table}") or spark.table(f"{raw_data_table}").isEmpty(): + # Download the data to a DataFrame + doc_articles = fetch_data_from_url(spark, data_source_url) + + #Save them as to unity catalog + doc_articles.write.mode('overwrite').saveAsTable(f"{raw_data_table}") + + doc_articles.display() + + +# COMMAND ---------- + +dbutils.notebook.exit(0) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/README.md.tmpl new file mode 100644 index 00000000..74440ffb --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/README.md.tmpl @@ -0,0 +1,2 @@ +# Data Preprocessing +To set up the data preprocessing job via scheduled Databricks workflow, please refer to [{{template `project_name_alphanumeric_underscore` .}}/resources/README.md](../resources/README.md) \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/__init__.py.tmpl new file mode 100644 index 00000000..e69de29b diff --git 
a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/notebooks/DataPreprocessing.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/notebooks/DataPreprocessing.py.tmpl new file mode 100644 index 00000000..0dc2e119 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/notebooks/DataPreprocessing.py.tmpl @@ -0,0 +1,194 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 +# MAGIC # Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules +# MAGIC # To disable autoreload; run %autoreload 0 + +# COMMAND ---------- + +################################################################################### +# Data Preprocessing Pipeline +# +# This notebook shows an example of a Data Preprocessing pipeline using Unity Catalog. +# It is configured and can be executed as the tasks in the PreprocessRawData workflow defined under +# ``{{template `project_name_alphanumeric_underscore` .}}/resources/data-preprocessing-workflow-resource.yml`` +# +# Parameters: +# * uc_catalog (required) - Name of the Unity Catalog +# * schema (required) - Name of the schema inside Unity Catalog +# * raw_data_table (required) - Name of the raw data table inside UC database +# * preprocessed_data_table (required) - Name of the preprocessed data table inside UC database +# * hf_tokenizer_model (optional) - Name of the HuggingFace tokenizer model name +# * max_chunk_size (optional) - Maximum chunk size +# * min_chunk_size (optional) - Minimum chunk size +# * chunk_overlap (optional) - Overlap between chunks +# * bundle_root (required) - Root of the bundle +# +# Widgets: +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * schema: Text widget to input the name of the database inside the Unity Catalog +# * Raw data table: Text widget to input the name of the raw data table inside the database of the Unity Catalog +# * Preprocessed data table: Text widget to input the name of the preprocessed data table inside the database of the Unity Catalog +# * HuggingFace tokenizer model: Text widget to input the name of the hugging face tokenizer model to import +# * Maximum chunk size: Maximum characters chunks will be split into +# * Minimum chunk size: minimum characters chunks will be split into +# * Chunk overlap: Overlap between chunks +# * Root of bundle: Text widget to input the root of the bundle +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Run the pipeline to chunk the raw data and store in Unity Catalog. +# +################################################################################## + +# COMMAND ---------- + +# List of input args needed to run this notebook as a job. +# Provide them via DB widgets or notebook arguments. 
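One point worth flagging for the widgets below: the chunk-size parameters are enforced in tokenizer tokens rather than raw characters, because `create_chunk.py` (further down) builds its splitter with `RecursiveCharacterTextSplitter.from_huggingface_tokenizer` and compares `len(tokenizer.encode(...))` against the limits. A small sketch, assuming the default `openai-community/openai-gpt` tokenizer from the widget:

```python
# Sketch only: illustrates what "chunk size" measures in this pipeline -
# the length of the Hugging Face tokenizer's encoding, not the character count.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
sample = "Delta Lake is an open source storage layer for the lakehouse."
print(len(sample))                    # characters
print(len(tokenizer.encode(sample)))  # tokens, which the max/min chunk sizes apply to
```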
+ +# A Unity Catalog containing the input data +dbutils.widgets.text( + "uc_catalog", + "ai_agent_stacks", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "ai_agent_ops", + label="Schema", +) +# Name of input table +dbutils.widgets.text( + "raw_data_table", + "raw_documentation", + label="Raw data table", +) +# Name of output table +dbutils.widgets.text( + "preprocessed_data_table", + "databricks_documentation", + label="Preprocessed data table", +) +# Name of huggingface tokenizer model +dbutils.widgets.text( + "hf_tokenizer_model", + "openai-community/openai-gpt", + label="HuggingFace tokenizer model", +) +# Maximum chunk size +dbutils.widgets.text("max_chunk_size", "500", label="Maximum chunk size") +# Minimum chunk size +dbutils.widgets.text("min_chunk_size", "20", label="Minimum chunk size") +# Chunk overlap +dbutils.widgets.text("chunk_overlap", "50", label="Chunk overlap") + +# Bundle root +dbutils.widgets.text( + "bundle_root", + "/", + label="Root of bundle", +) + +# COMMAND ---------- + +# DBTITLE 1,Define input and output variables +uc_catalog = dbutils.widgets.get("uc_catalog") +schema = dbutils.widgets.get("schema") +raw_data_table = dbutils.widgets.get("raw_data_table") +preprocessed_data_table = dbutils.widgets.get("preprocessed_data_table") +hf_tokenizer_model = dbutils.widgets.get("hf_tokenizer_model") +max_chunk_size = int(dbutils.widgets.get("max_chunk_size")) +min_chunk_size = int(dbutils.widgets.get("min_chunk_size")) +chunk_overlap = int(dbutils.widgets.get("chunk_overlap")) +bundle_root = dbutils.widgets.get("bundle_root") + +assert uc_catalog != "", "uc_catalog notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert raw_data_table != "", "raw_data_table notebook parameter must be specified" +assert preprocessed_data_table != "", "preprocessed_data_table notebook parameter must be specified" +assert hf_tokenizer_model != "", "hf_tokenizer_model notebook parameter must be specified" +assert max_chunk_size != "", "max_chunk_size notebook parameter must be specified" +assert min_chunk_size != "", "min_chunk_size notebook parameter must be specified" +assert chunk_overlap != "", "chunk_overlap notebook parameter must be specified" +assert bundle_root != "", "bundle_root notebook parameter must be specified" + +# Updating to bundle root +import sys + +root = dbutils.widgets.get("bundle_root") +sys.path.append(root) + +# COMMAND ---------- + +# DBTITLE 1,Initialize tokenizer +# Download tokenizer model to UC volume +from transformers import AutoTokenizer + +volume_folder = f"/Volumes/{uc_catalog}/{schema}/volume_databricks_documentation" + +spark.sql(f"CREATE VOLUME IF NOT EXISTS {uc_catalog}.{schema}.volume_databricks_documentation") + +# Initialize tokenizer once +tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_model, cache_dir=f'{volume_folder}/hg_cache') + + +# COMMAND ---------- + +# DBTITLE 1, Use the catalog and database specified in the notebook parameters +spark.sql(f"""USE `{uc_catalog}`.`{schema}`""") + +# COMMAND ---------- + +# DBTITLE 1, Create output preprocessed data table +if not spark.catalog.tableExists(f"{preprocessed_data_table}") or spark.table(f"{preprocessed_data_table}").isEmpty(): + spark.sql(f""" + CREATE TABLE IF NOT EXISTS {preprocessed_data_table} ( + id BIGINT GENERATED ALWAYS AS IDENTITY, + url STRING, + content STRING + ) + TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true') + """) + + +# COMMAND ---------- + +# DBTITLE 1,Create a 
user-defined function (UDF) to chunk all our documents with spark. +from functools import partial +import pandas as pd +from pyspark.sql.functions import pandas_udf +from data_preparation.data_preprocessing.preprocessing.create_chunk import split_html_on_p + +@pandas_udf("array") +def parse_and_split( + docs: pd.Series +) -> pd.Series: + """Parse and split html content into chunks. + + :param docs: Input documents + :return: List of chunked text for each input document + """ + + return docs.apply(lambda html: split_html_on_p( + html, + tokenizer=tokenizer, + chunk_overlap=chunk_overlap, + min_chunk_size=min_chunk_size, + max_chunk_size=max_chunk_size + )) + +# COMMAND ---------- + +# DBTITLE 1,Perform data preprocessing. +from pyspark.sql import functions as F + +(spark.table(raw_data_table) + .filter('text is not null') + .withColumn('content', F.explode(parse_and_split('text'))) + .drop("text") + .write.mode('overwrite').saveAsTable(preprocessed_data_table)) + +# COMMAND ---------- + +dbutils.notebook.exit(0) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/preprocessing/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/preprocessing/__init__.py.tmpl new file mode 100644 index 00000000..e69de29b diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/preprocessing/create_chunk.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/preprocessing/create_chunk.py.tmpl new file mode 100644 index 00000000..c75e37f7 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/preprocessing/create_chunk.py.tmpl @@ -0,0 +1,84 @@ +""" +This sample module contains data preprocessing logic to chunk HTML text. +You should plug in your own data chunking logic in the split_html_on_p method below. +""" + +from langchain.text_splitter import ( + HTMLHeaderTextSplitter, + RecursiveCharacterTextSplitter, +) +from lxml import etree + + +def get_splitters(tokenizer, max_chunk_size: int, chunk_overlap: int): + """Initialize splitters with the shared tokenizer. + + :param max_chunk_size: The maximum size of a chunk. + :param chunk_overlap: Target overlap between chunks. + Overlapping chunks helps to mitigate loss of information when context is divided between chunks. + :return: A tuple of text splitter and html text splitter + """ + text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( + tokenizer, chunk_size=max_chunk_size, chunk_overlap=chunk_overlap + ) + html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=[("p", "paragraph")]) + return text_splitter, html_splitter + + +def split_html_on_p( + html: str, + tokenizer, + chunk_overlap: int = 50, + min_chunk_size: int = 20, + max_chunk_size: int = 500, +): + try: + """Parse and split HTML content into chunks. + + Split on
<p>
, but merge small paragraph chunks together to avoid too small. + It uses HTMLHeaderTextSplitter to parse the HTML content and + RecursiveCharacterTextSplitter to split the text into chunks + + TODO: Update and adapt the sample code for your use case + + :param html: HTML content + :param chunk_overlap: Target overlap between chunks. + Overlapping chunks helps to mitigate loss of information when context is divided between chunks. + :param min_chunk_size: The minimum size of a chunk. + :param max_chunk_size: The maximum size of a chunk. + :return: List of chunked text for input HTML content + """ + if not html: + return [] + + # Get splitters + text_splitter, html_splitter = get_splitters( + tokenizer, max_chunk_size, chunk_overlap + ) + + p_chunks = html_splitter.split_text(html) + chunks = [] + previous_chunk = "" + + # Merge chunks together to add text before
<p>
and avoid too small docs. + for c in p_chunks: + # Concat the paragraph + content = c.page_content + if len(tokenizer.encode(previous_chunk + content)) <= max_chunk_size / 2: + previous_chunk += content + "\n" + else: + chunks.extend(text_splitter.split_text(previous_chunk.strip())) + previous_chunk = content + "\n" + + if previous_chunk: + chunks.extend(text_splitter.split_text(previous_chunk.strip())) + + # Discard chunks smaller than min_chunk_size + return [c for c in chunks if len(tokenizer.encode(c)) > min_chunk_size] + + except etree.XSLTApplyError as e: + print(f"XSLTApplyError: {e}") + return None + except Exception as e: + print(f"An error occurred: {e}") + return None \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/README.md.tmpl new file mode 100644 index 00000000..4ef3434f --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/README.md.tmpl @@ -0,0 +1,9 @@ +# Vector Search + +To enable vector search as part of a scheduled Databricks workflow, please: +- Update all the TODOs in the [vector search resource file](../resources/vector-search-resource.yml). +- Uncomment the vector search workflow from the main Databricks Asset Bundles file [databricks.yml](../databricks.yml). + +For more details, refer to [{{template `project_name_alphanumeric_underscore` .}}/resources/README.md](../resources/README.md). + +This workflow supports the building of a vector index given a source table. \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/notebooks/VectorSearch.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/notebooks/VectorSearch.py.tmpl new file mode 100644 index 00000000..14865f5d --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/notebooks/VectorSearch.py.tmpl @@ -0,0 +1,149 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 +# MAGIC # Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules +# MAGIC # To disable autoreload; run %autoreload 0 + +# COMMAND ---------- + +################################################################################### +# Vector Search +# +# This notebook creates a Vector Search index from a table containing chunked documents. 
+# +# Parameters: +# * uc_catalog (required) - Name of the Unity Catalog +# * schema (required) - Name of the schema inside Unity Catalog +# * preprocessed_data_table (required) - Name of the preprocessed data table inside database of Unity Catalog +# * vector_search_endpoint (required) - Name of the Vector Search endpoint +# * bundle_root (required) - Root of the bundle +# +# Widgets: +# * Vector Search endpoint: Text widget to input the name of the Vector Search endpoint +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * Schema: Text widget to input the name of the database inside the Unity Catalog +# * Preprocessed data table: Text widget to input the name of the preprocessed data table inside the database of Unity Catalog +# * Root of bundle: Text widget to input the root of the bundle +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Run the pipeline to set up the vector search endpoint. +# 3. Create index. +# +################################################################################## + +# COMMAND ---------- + +# List of input args needed to run this notebook as a job. +# Provide them via DB widgets or notebook arguments in your DAB resources. + +# A Unity Catalog location containing the input data +dbutils.widgets.text( + "uc_catalog", + "ai_agent_stacks", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "ai_agent_ops", + label="Schema", +) +# Name of preprocessed data table +dbutils.widgets.text( + "preprocessed_data_table", + "databricks_documentation", + label="Preprocessed data table", +) +# A Vector Search Endpoint for retrieving processed data +dbutils.widgets.text( + "vector_search_endpoint", + "ai_agent_endpoint", + label="Vector Search endpoint", +) +# Bundle root +dbutils.widgets.text( + "bundle_root", + "/", + label="Root of bundle", +) + + +# COMMAND ---------- + +# DBTITLE 1,Define variables +vector_search_endpoint = dbutils.widgets.get("vector_search_endpoint") +uc_catalog = dbutils.widgets.get("uc_catalog") +schema = dbutils.widgets.get("schema") +preprocessed_data_table = dbutils.widgets.get("preprocessed_data_table") +bundle_root = dbutils.widgets.get("bundle_root") + +assert vector_search_endpoint != "", "vector_search_endpoint notebook parameter must be specified" +assert uc_catalog != "", "uc_catalog notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert preprocessed_data_table != "", "preprocessed_data_table notebook parameter must be specified" +assert bundle_root != "", "bundle_root notebook parameter must be specified" + +# Updating to bundle root +import sys + +root = dbutils.widgets.get("bundle_root") +sys.path.append(root) + +# COMMAND ---------- + +# DBTITLE 1,Initialize endpoint +from databricks.vector_search.client import VectorSearchClient +from data_preparation.vector_search.vector_search_utils.utils import vs_endpoint_exists, wait_for_vs_endpoint_to_be_ready + +vsc = VectorSearchClient(disable_notice=True) + +if not vs_endpoint_exists(vsc, vector_search_endpoint): + vsc.create_endpoint(name=vector_search_endpoint, endpoint_type="STANDARD") + +# this may throw an error on the first pass, once the endpoint is created we'd see correct messages +wait_for_vs_endpoint_to_be_ready(vsc, vector_search_endpoint) +print(f"Endpoint named {vector_search_endpoint} is ready.") + +# COMMAND ---------- + +# DBTITLE 1,Create Index +from data_preparation.vector_search.vector_search_utils.utils import index_exists, 
wait_for_index_to_be_ready +from databricks.sdk import WorkspaceClient +import databricks.sdk.service.catalog as c + +# The table we'd like to index +source_table_fullname = f"{uc_catalog}.{schema}.{preprocessed_data_table}" + +# Where we want to store our index +vs_index_fullname = f"{uc_catalog}.{schema}.{preprocessed_data_table}_vs_index" + +if not index_exists(vsc, vector_search_endpoint, vs_index_fullname): + print(f"Creating index {vs_index_fullname} on endpoint {vector_search_endpoint}...") + vsc.create_delta_sync_index( + endpoint_name=vector_search_endpoint, + index_name=vs_index_fullname, + source_table_name=source_table_fullname, + pipeline_type="TRIGGERED", + primary_key="id", + embedding_source_column="content", # The column containing our text + embedding_model_endpoint_name="databricks-gte-large-en" # The embedding endpoint used to create the embeddings + ) + #Let's wait for the index to be ready and all our embeddings to be created and indexed + vsc.get_index(vector_search_endpoint, vs_index_fullname).wait_until_ready() +else: + #Trigger a sync to update our vs content with the new data saved in the table + vsc.get_index(vector_search_endpoint, vs_index_fullname).sync() + +print(f"Index {vs_index_fullname} on table {source_table_fullname} is ready") + +# COMMAND ---------- + +# DBTITLE 1,Test if Index Online +import databricks +import time +from data_preparation.vector_search.vector_search_utils.utils import check_index_online + +vector_index=vsc.get_index(endpoint_name=vector_search_endpoint, index_name=vs_index_fullname) + +check_index_online(vs_index_fullname, vector_index) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/vector_search_utils/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/vector_search_utils/__init__.py.tmpl new file mode 100644 index 00000000..1bdf830e --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/vector_search_utils/__init__.py.tmpl @@ -0,0 +1,17 @@ +"""Vector Search utilities for data preparation.""" + +from .utils import ( + vs_endpoint_exists, + wait_for_vs_endpoint_to_be_ready, + index_exists, + wait_for_index_to_be_ready, + check_index_online, +) + +__all__ = [ + "vs_endpoint_exists", + "wait_for_vs_endpoint_to_be_ready", + "index_exists", + "wait_for_index_to_be_ready", + "check_index_online", +] \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/vector_search_utils/utils.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/vector_search_utils/utils.py.tmpl new file mode 100644 index 00000000..73459b74 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/vector_search_utils/utils.py.tmpl @@ -0,0 +1,65 @@ +import databricks +import time + +def vs_endpoint_exists(vsc, endpoint_name): + try: + vsc.get_endpoint(endpoint_name) + return True + except Exception as e: + if 'Not Found' in str(e): + print(f'Unexpected error describing the endpoint. Try deleting it? 
vsc.delete_endpoint({endpoint_name}) and rerun the previous cell') + raise e + return False + +def wait_for_vs_endpoint_to_be_ready(vsc, vs_endpoint_name): + for i in range(180): + endpoint = vsc.get_endpoint(vs_endpoint_name) + status = endpoint.get("endpoint_status", endpoint.get("status"))["state"].upper() + if "ONLINE" in status: + return endpoint + elif "PROVISIONING" in status or i <6: + if i % 20 == 0: + print(f"Waiting for endpoint to be ready, this can take a few min... {endpoint}") + time.sleep(10) + else: + raise Exception(f'''Error with the endpoint {vs_endpoint_name}. - this shouldn't happen: {endpoint}.\n Please delete it and re-run the previous cell: vsc.delete_endpoint("{vs_endpoint_name}")''') + raise Exception(f"Timeout, your endpoint isn't ready yet: {vsc.get_endpoint(vs_endpoint_name)}") + + +def index_exists(vsc, endpoint_name, index_full_name): + try: + vsc.get_index(endpoint_name, index_full_name).describe() + return True + except Exception as e: + if 'RESOURCE_DOES_NOT_EXIST' not in str(e): + print(f'Unexpected error describing the index. This could be a permission issue. Try deleting it? vsc.delete_index({index_full_name})') + raise e + return False + +def wait_for_index_to_be_ready(vsc, vs_endpoint_name, index_name): + for i in range(180): + idx = vsc.get_index(vs_endpoint_name, index_name).describe() + index_status = idx.get('status', idx.get('index_status', {})) + status = index_status.get('status', 'UNKOWN').upper() + url = index_status.get('index_url', index_status.get('url', 'UNKOWN')) + if "ONLINE" in status: + return idx + if "UNKOWN" in status: + print(f"Can't get the status - will assume index is ready {idx} - url: {url}") + return idx + elif "PROVISIONING" in status: + if i % 20 == 0: print(f"Waiting for index to be ready, this can take a few min... {index_status} - pipeline url:{url}") + time.sleep(10) + else: + raise Exception(f'''Error with the index - this shouldn't happen. DLT pipeline might have been killed.\n Please delete it and re-run the previous cell: vsc.delete_index("{index_name}, {vs_endpoint_name}") \nIndex details: {idx}''') + raise Exception(f"Timeout, your index isn't ready yet: {vsc.get_index(index_name, vs_endpoint_name)}") + +def check_index_online(vs_index_fullname: str, vector_index: databricks.vector_search.index.VectorSearchIndex): + for i in range(180): + status = vector_index.describe()['status']["detailed_state"] + if (status != "ONLINE" and status != "ONLINE_NO_PENDING_UPDATE"): + print(f"Syncing {vs_index_fullname}") + time.sleep(10) + else: + print(f"{vs_index_fullname} is now synced") + return \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks_agentops.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks_agentops.yml.tmpl new file mode 100644 index 00000000..9a912394 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks_agentops.yml.tmpl @@ -0,0 +1,93 @@ +# The name of the bundle. run `databricks bundle schema` to see the full bundle settings schema. +bundle: + name: {{ .input_project_name }} + +variables: + uc_catalog: + description: Unity Catalog used to store data and artifacts. + default: {{ .input_catalog_name }} + schema: + description: "Schema in Unity Catalog." + default: {{ .input_schema_name }} + raw_data_table: + description: "Table in Unity Catalog to store raw data." 
+ default: raw_documentation + data_source_url: + description: "The url to scrape." + default: https://docs.databricks.com/en/doc-sitemap.xml + preprocessed_data_table: + description: "Table in Unity Catalog to store preprocessed data." + default: databricks_documentation + min_chunk_size: + description: "Minimum size of chunks for vectorization." + default: 20 + max_chunk_size: + description: "Maximum size of chunks for vectorization." + default: 500 + chunk_overlap: + description: "Overlap of chunks for vectorization." + default: 50 + hf_tokenizer_model: + description: "Hugging Face Tokenizer Model to use for vectorization." + default: "openai-community/openai-gpt" + vector_search_endpoint: + description: "Vector Search endpoint to create." + default: ai_agent_endpoint + vector_search_index: + description: "Vector Search index to populate." + default: databricks_documentation_vs_index + experiment: + description: "Experiment to log run under." + default: /Users/${workspace.current_user.userName}/agent_function_chatbot + registered_model: + description: "Name of agent model to register." + default: agent_function_chatbot + max_words: + description: "Maximum number of words for agent to respond with." + default: 20 + agent_model_endpoint: + description: "Foundation model endpoint to use for agent." + default: "databricks-meta-llama-3-3-70b-instruct" + eval_table: + description: "Table in Unity Catalog to store evaluation data." + default: "databricks_documentation_eval" + model_alias: + description: "Model alias to use for trained model" + default: agent_latest + scale_to_zero: + description: "Scale model endpoint to zero when not in use." + default: True + workload_size: + description: "Size of compute for anticipated number of concurrent requests." 
+ default: "Small" + chatbot_name: + description: "Name of the Databricks App" + default: "dash-chatbot-app" + +include: + # Resources folder contains Agent artifact resources for the Agent project that defines the agent + # and workflows resources for data preparation -> agent creation & evaluation -> deployment + - ./resources/data-preparation-resource.yml + - ./resources/agents-artifacts-resource.yml + - ./resources/agent-resource.yml + - ./resources/app-deployment-resource.yml + +# Deployment Target specific values for workspace +targets: + dev: + default: true + workspace: + # TODO: add dev workspace URL + host: + + staging: + workspace: + host: {{template `databricks_staging_workspace_host` .}} + + prod: + workspace: + host: {{template `databricks_prod_workspace_host` .}} + + test: + workspace: + host: {{template `databricks_staging_workspace_host` .}} \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/requirements_agentops.txt.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/requirements_agentops.txt.tmpl new file mode 100644 index 00000000..c3c9c152 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/requirements_agentops.txt.tmpl @@ -0,0 +1,16 @@ +mlflow==3.1.1 +numpy>=1.23.0 +pandas==1.5.3 +scikit-learn>=1.1.1 +matplotlib>=3.5.2 +pillow>=10.0.1 +Jinja2==3.0.3 +pyspark~=3.3.0 +pytz~=2022.2.1 +pytest>=7.1.2 +databricks-sdk==0.58.0 +beautifulsoup4==4.13.3 +transformers==4.41.1 +langchain==0.2.1 +databricks-vectorsearch==0.56 +lxml==5.4.0 diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README_agentops.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README_agentops.md.tmpl new file mode 100644 index 00000000..46a9dc46 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README_agentops.md.tmpl @@ -0,0 +1,227 @@ +# Databricks Agent Resource Configurations +[(back to project README)](../README.md) + +## Table of contents +* [Intro](#intro) +* [Local development and dev workspace](#local-development-and-dev-workspace) +* [Develop and test config changes](#develop-and-test-config-changes) +{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }} +* [CI/CD](#set-up-cicd) +* [Deploy initial ML resources](#deploy-initial-ml-resources) +* [Deploy config changes](#deploy-config-changes) +{{- end }} + +## Intro + +### databricks CLI bundles +AgentOps Stacks resources are configured and deployed through [Databricks CLI bundles]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/bundle-cli.html")) }}). +The bundle setting file must be expressed in YAML format and must contain at minimum the top-level bundle mapping. + +The Databricks CLI bundles top level is defined by file `{{template `project_name_alphanumeric_underscore` .}}/databricks.yml`. +During Databricks CLI bundles deployment, the root config file will be loaded, validated and deployed to workspace provided by the environment together with all the included resources. 
+
+Agent Resource Configurations in this directory:
+ - data preparation workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml`)
+ - agent development and deployment workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/agent-resource.yml`)
+ - app deployment workflow (`{{template `project_name_alphanumeric_underscore` .}}/resources/app-deployment-resource.yml`)
+ - model definition, experiment, and app definition (`{{template `project_name_alphanumeric_underscore` .}}/resources/agents-artifacts-resource.yml`)
+
+
+### Deployment Config & CI/CD integration
+The agent resources can be deployed to a Databricks workspace based on the Databricks CLI bundles deployment config. Deployment configs of different deployment targets share the general agent resource configurations, with the added ability to specify deployment-target-specific values (workspace URI, model name, jobs notebook parameters, etc.).
+
+{{- if (eq .input_setup_cicd_and_project `Project_Only`) }}
+
+NOTE: This project was not set up with CI/CD workflows. You can set up CI/CD with a new initialization of AgentOps Stacks. The rest of this section only applies if you previously set up CI/CD in a monorepo setup or have otherwise set up CI/CD.
+{{- else }}
+This project ships with CI/CD workflows for developing and deploying agent resource configurations based on deployment config.
+{{- end }}
+
+When you initialize the stack, we set the catalog name in the `{{template `project_name_alphanumeric_underscore` .}}/databricks.yml`, so we expect a catalog of the same name in each environment.
+
+If you want to use different catalog names, please set the variable `uc_catalog` under each target environment:
+
+```
+targets:
+  dev:
+    variables:
+      uc_catalog:
+        description: Unity Catalog used to store data and artifacts.
+        default:
+
+```
+
+| Deployment Target | Description | Databricks Workspace | Model Name | MLflow Experiment |
+|-------------------|-------------|----------------------|------------|-------------------|
+| dev | The `dev` deployment target is used to deploy resources to the development workspace with `dev` configs. The config is for project development purposes. | dev workspace | dev-{{template `model_name` .}} | /dev-{{template `experiment_base_name` .}} |
+| staging | The `staging` deployment target is part of the CD pipeline. Latest {{ .input_default_branch }} content will be deployed to the staging workspace with `staging` config. | staging workspace | staging-{{template `model_name` .}} | /staging-{{template `experiment_base_name` .}} |
+| prod | The `prod` deployment target is part of the CD pipeline. Latest {{ .input_release_branch }} content will be deployed to the prod workspace with `prod` config. | prod workspace | prod-{{template `model_name` .}} | /prod-{{template `experiment_base_name` .}} |
+| test | The `test` deployment target is part of the CI pipeline. For changes targeting the {{ .input_default_branch }} branch, upon making a PR, an integration test will be triggered and agent resources deployed to the staging workspace defined under the `test` deployment target. | staging workspace | test-{{template `model_name` .}} | /test-{{template `experiment_base_name` .}} |
+
+During code development, you can deploy local resource configurations together with code to the Databricks workspace to run the ingestion, development, or deployment pipelines. The deployment will use the `dev` config by default.
+
+You can open a PR (pull request) to modify code or the resource config against the {{ .input_default_branch }} branch.
+The PR will trigger Python unit tests, followed by an integration test executed on the staging workspace, as defined under the `test` environment resource.
+
+Upon merging a PR to the {{ .input_default_branch }} branch, the {{ .input_default_branch }} branch content will be deployed to the staging workspace with `staging` environment resource configurations.
+
+Upon merging code into the release branch, the release branch content will be deployed to the prod workspace with `prod` environment resource configurations.
+
+## Local development and dev workspace
+
+### Set up authentication
+
+To set up the Databricks CLI using a Databricks personal access token, take the following steps:
+
+1. Follow [Databricks CLI]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/databricks-cli.html")) }}) to download and set up the Databricks CLI locally.
+2. Complete the `TODO` in `{{template `project_name_alphanumeric_underscore` .}}/databricks.yml` to add the dev workspace URI under `targets.dev.workspace.host`.
+3. [Create a personal access token]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/auth/pat.html")) }})
+   in your dev workspace and copy it.
+4. Set an env variable `DATABRICKS_TOKEN` with your Databricks personal access token in your terminal. For example, run `export DATABRICKS_TOKEN=dapi12345` if the access token is dapi12345.
+5. You can now use the Databricks CLI to validate and deploy resource configurations to the dev workspace.
+
+Alternatively, you can use the other approaches described in the [Databricks CLI]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/cli/databricks-cli.html")) }}) documentation to set up authentication. For example, using your Databricks username/password, or setting up a local profile.
+
+### Validate and provision agent resource configurations
+1. After installing the Databricks CLI and creating the `DATABRICKS_TOKEN` env variable, change to the `{{template `project_name_alphanumeric_underscore` .}}` directory.
+2. Run `databricks bundle validate` to validate the Databricks resource configurations.
+3. Run `databricks bundle deploy` to provision the Databricks resource configurations to the dev workspace. The resource configurations and your code will be copied together to the dev workspace. The defined resources such as Databricks Workflows, the Registered Model, and the MLflow Experiment will be provisioned according to the config files under `{{template `project_name_alphanumeric_underscore` .}}/resources`.
+4. Go to the Databricks dev workspace, check the status of the defined model, experiment, and workflows, and interact with the created workflows.
+
+### Destroy resource configurations
+After development is done, you can run `databricks bundle destroy` to destroy (remove) the defined Databricks resources in the dev workspace.
+
+Note that existing model versions will prevent the registered model from being deleted. Please update the version stage to `None` or `Archived` before destroying the resources.
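+
+For reference, a typical dev loop from your terminal might look like the following sketch (the token value is a placeholder, and `dev` is the default target):
+
+```bash
+# Hypothetical dev loop against the dev workspace; adjust the token and target to your setup
+export DATABRICKS_TOKEN=dapi12345          # personal access token for the dev workspace
+cd {{template `project_name_alphanumeric_underscore` .}}
+
+databricks bundle validate                 # check the resource configurations
+databricks bundle deploy                   # provision them to the dev workspace
+
+# ...develop, run, and inspect the workflows...
+
+databricks bundle destroy                  # remove the bundle-managed resources when finished
+```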
+
+In addition, currently, there are a number of assets not managed by the bundle that cannot be deleted by the `destroy` command, including:
+
+- Agent deployment. Additionally, assets created by the deployment (the Feedback registered model, the CPU endpoint for the agent and feedback model, and the payload table).
+- Data assets, including the volume with Databricks documentation and the three Delta tables.
+- Vector Search assets (the endpoint and index).
+- UC Functions.
+
+{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }}
+## Set up CI/CD
+Please refer to [agentops-setup](../../docs/agentops-setup.md#configure-cicd) for instructions to set up CI/CD.
+
+## Deploy initial agent resources
+After completing the prerequisites, create and push a PR branch adding all files to the Git repo:
+```
+git checkout -b add-agent-resource-config-and-code
+git add .
+git commit -m "Add agent resource config and agent code"
+git push upstream add-agent-resource-config-and-code
+```
+Open a pull request to merge the pushed branch into the `{{ .input_default_branch }}` branch.
+Upon creating this PR, the CI workflows will be triggered.
+These CI workflows will run unit and integration tests of the agent code,
+in addition to validating the Databricks resources to be deployed to both staging and prod workspaces.
+Once CI passes, merge the PR into the `{{ .input_default_branch }}` branch. This will deploy an initial set of Databricks resources to the staging workspace.
+Resources will be deployed to the prod workspace on pushing code to the `{{ .input_release_branch }}` branch.
+{{- end }}
+
+## Develop and test config changes
+
+### Databricks CLI bundles schema overview
+To get started, open `{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml`. The file contains the agent resource definition of a data ingestion job, like:
+
+```yaml
+resources:
+  jobs:
+    data_preprocessing_job:
+      name: ${bundle.target}-{{ .input_project_name }}-data-preprocessing-job
+      tasks:
+        - task_key: RawDataIngest
+          notebook_task:
+            notebook_path: ../data_preparation/data_ingestion/notebooks/DataIngestion.py
+            base_parameters:
+              # TODO modify these arguments to reflect your setup.
+              uc_catalog: ${var.uc_catalog}
+              schema: ${var.schema}
+              raw_data_table: ${var.raw_data_table}
+              data_source_url: https://docs.databricks.com/en/doc-sitemap.xml
+              # git source information of current ML resource deployment. It will be persisted as part of the workflow run
+              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
+      ...
+```
+
+The example above defines a Databricks job with name `${bundle.target}-{{ .input_project_name }}-data-preprocessing-job`
+that runs the notebook under `{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py` to ingest documents from the configured documentation site.
+
+As this is running on serverless, there is no need for cluster definitions. If you are deploying to a non-serverless workspace, please reference the [MLOps Stacks Resource README](https://github.com/databricks/mlops-stacks/blob/main/template/%7B%7B.input_root_dir%7D%7D/%7B%7Btemplate%20%60project_name_alphanumeric_underscore%60%20.%7D%7D/resources/README.md.tmpl).
+
+We specify a `data_preprocessing_job` under `resources/jobs` to define a Databricks workflow with internal key `data_preprocessing_job` and job name `${bundle.target}-{{ .input_project_name }}-data-preprocessing-job`.
+The workflow contains a single task with task key `RawDataIngest`. The task runs the notebook `{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py`, passing the parameters `uc_catalog`, `schema`, `raw_data_table`, and `data_source_url` to the notebook.
+After setting up the Databricks CLI, you can run the command `databricks bundle schema` to learn more about the Databricks CLI bundles schema.
+
+The `notebook_path` is the relative path starting from the resource YAML file.
+
+### Environment config based variables
+The `${bundle.target}` will be replaced by the environment config name during the bundle deployment. For example, during the deployment of a `test` environment config, the job name will be
+`test-{{ .input_project_name }}-data-preprocessing-job`. During the deployment of the `staging` environment config, the job name will be
+`staging-{{ .input_project_name }}-data-preprocessing-job`.
+
+
+To use different values for different environments, you can define bundle variables and override them for a given target, for example:
+```yaml
+variables:
+  raw_data_table:
+    description: The table name to be used for storing the raw data.
+    default: input_table
+
+targets:
+  dev:
+    variables:
+      raw_data_table: dev_table
+  test:
+    variables:
+      raw_data_table: test_table
+
+resources:
+  jobs:
+    data_preprocessing_job:
+      name: ${bundle.target}-{{ .input_project_name }}-data-preprocessing-job
+      tasks:
+        - task_key: RawDataIngest
+          notebook_task:
+            notebook_path: ../data_preparation/data_ingestion/notebooks/DataIngestion.py
+            base_parameters:
+              # TODO modify these arguments to reflect your setup.
+              uc_catalog: ${var.uc_catalog}
+              schema: ${var.schema}
+              raw_data_table: ${var.raw_data_table}
+              data_source_url: https://docs.databricks.com/en/doc-sitemap.xml
+              # git source information of current ML resource deployment. It will be persisted as part of the workflow run
+              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
+      ...
+```
+The `data_preprocessing_job` notebook parameter `raw_data_table` uses a bundle variable `raw_data_table` with default value "input_table".
+The variable value will be overwritten with "dev_table" for the `dev` environment config and "test_table" for the `test` environment config:
+- during deployment with the `dev` environment config, the `raw_data_table` parameter will get the value "dev_table"
+- during deployment with the `staging` environment config, the `raw_data_table` parameter will get the value "input_table"
+- during deployment with the `prod` environment config, the `raw_data_table` parameter will get the value "input_table"
+- during deployment with the `test` environment config, the `raw_data_table` parameter will get the value "test_table"
+
+### Test config changes
+To test out a config change, simply edit one of the fields above.
+
+Then follow [Local development and dev workspace](#local-development-and-dev-workspace) to deploy the change to the dev workspace.
+Alternatively you can open a PR. Continuous integration will then validate the updated config and deploy tests to the staging workspace.
+
+{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }}
+## Deploy config changes
+
+### Dev workspace deployment
+Please refer to [Local development and dev workspace](#local-development-and-dev-workspace).
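+
+For a quick one-off check when deploying a config change from your terminal, you can also override an individual bundle variable at deploy time; a sketch under the assumption that your Databricks CLI version supports the `--var` flag (the table name below is illustrative):
+
+```bash
+# Validate and deploy against the dev target, overriding one variable for this deployment only
+databricks bundle validate -t dev
+databricks bundle deploy -t dev --var="raw_data_table=my_dev_table"   # hypothetical value
+```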
+
+### Test workspace deployment (CI)
+After setting up CI/CD, PRs against the {{ .input_default_branch }} branch will trigger CI workflows to run unit tests, integration tests, and resource validation.
+The integration test will deploy the MLflow model, MLflow experiment, and Databricks workflow resources defined under the `test` environment resource config to the staging workspace. The integration test then triggers a run of the workflows to verify the code.
+
+### Staging and Prod workspace deployment (CD)
+After merging a PR to the {{ .input_default_branch }} branch, continuous deployment automation will deploy the `staging` resources to the staging workspace.
+
+When you are about to cut a release, create and merge a PR to merge changes from {{ .input_default_branch }} to {{ .input_release_branch }}. Continuous deployment automation will deploy `prod` resources to the prod workspace.
+{{- end }}
+
+[Back to project README](../README.md)
diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/agent-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/agent-resource.yml.tmpl
new file mode 100644
index 00000000..e2508616
--- /dev/null
+++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/agent-resource.yml.tmpl
@@ -0,0 +1,82 @@
+common_permissions: &permissions
+  permissions:
+    - level: CAN_VIEW
+      group_name: users
+
+
+resources:
+  jobs:
+    agent_development_job:
+      parameters:
+        - name: bundle_root
+          default: ${workspace.file_path}
+      name: ${bundle.target}-{{ .input_project_name }}-agent-development-job
+      tasks:
+        - task_key: AgentDevelopment
+          notebook_task:
+            notebook_path: ../agent_development/agent/notebooks/Agent.py
+            base_parameters:
+              uc_catalog: ${var.uc_catalog}
+              schema: ${var.schema}
+              vector_search_endpoint: ${var.vector_search_endpoint}
+              vector_search_index: ${var.vector_search_index}
+              experiment: ${var.experiment}
+              registered_model: ${var.registered_model}
+              agent_model_endpoint: ${var.agent_model_endpoint}
+              max_words: ${var.max_words}
+              model_alias: ${var.model_alias}
+              # git source information of current ML resource deployment. It will be persisted as part of the workflow run
+              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
+          environment_key: agent_requirements
+
+        - task_key: AgentEvaluation
+          depends_on:
+            - task_key: AgentDevelopment
+          notebook_task:
+            notebook_path: ../agent_development/agent_evaluation/notebooks/AgentEvaluation.py
+            base_parameters:
+              uc_catalog: ${var.uc_catalog}
+              schema: ${var.schema}
+              experiment: ${var.experiment}
+              registered_model: ${var.registered_model}
+              eval_table: ${var.eval_table}
+              model_alias: ${var.model_alias}
+              # git source information of current ML resource deployment. It will be persisted as part of the workflow run
+              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
+          environment_key: agent_requirements
+
+        - task_key: AgentDeployment
+          depends_on:
+            - task_key: AgentEvaluation
+          notebook_task:
+            notebook_path: ../agent_deployment/model_serving/notebooks/ModelServing.py
+            base_parameters:
+              uc_catalog: ${var.uc_catalog}
+              schema: ${var.schema}
+              registered_model: ${var.registered_model}
+              model_alias: ${var.model_alias}
+              scale_to_zero: ${var.scale_to_zero}
+              workload_size: ${var.workload_size}
+              # git source information of current ML resource deployment. It will be persisted as part of the workflow run
+              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
+          environment_key: agent_requirements
+
+      schedule:
+        quartz_cron_expression: "0 0 6 * * ?" # daily at 6am
+        timezone_id: UTC
+
+      environments:
+        - environment_key: agent_requirements
+          spec:
+            client: "3"
+            dependencies:
+              - "-r /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target}/files/agent_development/agent_requirements.txt"
+
+      <<: *permissions
+      # If you want to turn on notifications for this job, please uncomment the below code,
+      # and provide a list of emails to the on_failure argument.
+      #
+      # email_notifications:
+      #   on_failure:
+      #     - first@company.com
+      #     - second@company.com
\ No newline at end of file
diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/agents-artifacts-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/agents-artifacts-resource.yml.tmpl
new file mode 100644
index 00000000..cd744cb7
--- /dev/null
+++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/agents-artifacts-resource.yml.tmpl
@@ -0,0 +1,40 @@
+# Allow users to read the experiment
+common_permissions: &permissions
+  permissions:
+    - level: CAN_READ
+      group_name: users
+
+# Allow users to execute models in Unity Catalog
+grants: &grants
+  grants:
+    - privileges:
+        - EXECUTE
+      principal: account users
+
+# Defines the models, experiment, and chat app
+resources:
+  registered_models:
+    model:
+      name: ${var.registered_model}
+      catalog_name: ${var.uc_catalog}
+      schema_name: ${var.schema}
+      comment: Registered model in Unity Catalog for the {{ .input_project_name }} Agent project for the ${bundle.target} deployment target.
+      <<: *grants
+
+    feedback_model:
+      name: feedback
+      catalog_name: ${var.uc_catalog}
+      schema_name: ${var.schema}
+      comment: Registered model for the agent's feedback for the ${bundle.target} deployment target.
+      <<: *grants
+
+  experiments:
+    experiment:
+      name: ${var.experiment}
+      <<: *permissions
+
+  apps:
+    dash-chatbot-app:
+      name: ${var.chatbot_name}
+      source_code_path: /Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target}/files/agent_deployment/chat_interface_deployment/
+      description: 'Your Databricks assistant app.'
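+
+# Note: the `&permissions` / `<<: *permissions` pattern used throughout these resource
+# files is plain YAML anchor-and-merge syntax rather than anything bundle-specific.
+# A minimal sketch with a hypothetical job key, kept here as a comment for reference:
+#
+#   common_permissions: &permissions    # define the shared block once under an anchor...
+#     permissions:
+#       - level: CAN_VIEW
+#         group_name: users
+#
+#   resources:
+#     jobs:
+#       example_job:                    # hypothetical job key
+#         name: example-job
+#         <<: *permissions              # ...and merge it in; the job gets the same permissions list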
diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/app-deployment-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/app-deployment-resource.yml.tmpl new file mode 100644 index 00000000..ab956a30 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/app-deployment-resource.yml.tmpl @@ -0,0 +1,36 @@ +common_permissions: &permissions + permissions: + - level: CAN_VIEW + group_name: users + + +resources: + jobs: + app_deployment_job: + parameters: + - name: bundle_root + default: ${workspace.file_path} + name: ${bundle.target}-{{ .input_project_name }}-app-deployment-job + tasks: + - task_key: AppDeployment + notebook_task: + notebook_path: ../agent_deployment/chat_interface_deployment/LaunchApp.py + base_parameters: + uc_catalog: ${var.uc_catalog} + schema: ${var.schema} + registered_model: ${var.registered_model} + agent_model_endpoint: ${var.agent_model_endpoint} + app_name: ${var.chatbot_name} + # git source information of current ML resource deployment. It will be persisted as part of the workflow run + git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} + schedule: + quartz_cron_expression: "0 0 8 * * ?" # daily at 8am + timezone_id: UTC + <<: *permissions + # If you want to turn on notifications for this job, please uncomment the below code, + # and provide a list of emails to the on_failure argument. + # + # email_notifications: + # on_failure: + # - first@company.com + # - second@company.com \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml.tmpl new file mode 100644 index 00000000..99ceed39 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml.tmpl @@ -0,0 +1,79 @@ +common_permissions: &permissions + permissions: + - level: CAN_VIEW + group_name: users + + +resources: + jobs: + data_preprocessing_job: + parameters: + - name: bundle_root + default: ${workspace.file_path} + name: ${bundle.target}-{{ .input_project_name }}-data-preprocessing-job + tasks: + - task_key: RawDataIngest + notebook_task: + notebook_path: ../data_preparation/data_ingestion/notebooks/DataIngestion.py + base_parameters: + # TODO modify these arguments to reflect your setup. + uc_catalog: ${var.uc_catalog} + schema: ${var.schema} + raw_data_table: ${var.raw_data_table} + data_source_url: https://docs.databricks.com/en/doc-sitemap.xml + # git source information of current ML resource deployment. It will be persisted as part of the workflow run + git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} + environment_key: data_prep_requirements + + - task_key: PreprocessRawData + depends_on: + - task_key: RawDataIngest + notebook_task: + notebook_path: ../data_preparation/data_preprocessing/notebooks/DataPreprocessing.py + base_parameters: + # TODO modify these arguments to reflect your setup. 
+              uc_catalog: ${var.uc_catalog}
+              schema: ${var.schema}
+              raw_data_table: ${var.raw_data_table}
+              preprocessed_data_table_name: ${var.preprocessed_data_table}
+              max_chunk_size: ${var.max_chunk_size}
+              min_chunk_size: ${var.min_chunk_size}
+              chunk_overlap: ${var.chunk_overlap}
+              hf_tokenizer_model: ${var.hf_tokenizer_model}
+              # git source information of current ML resource deployment. It will be persisted as part of the workflow run
+              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
+          environment_key: data_prep_requirements
+
+        - task_key: VectorSearchIndex
+          depends_on:
+            - task_key: PreprocessRawData
+          notebook_task:
+            notebook_path: ../data_preparation/vector_search/notebooks/VectorSearch.py
+            base_parameters:
+              # TODO modify these arguments to reflect your setup.
+              uc_catalog: ${var.uc_catalog}
+              schema: ${var.schema}
+              preprocessed_data_table: ${var.preprocessed_data_table}
+              vector_search_endpoint: ${var.vector_search_endpoint}
+              # git source information of current ML resource deployment. It will be persisted as part of the workflow run
+              git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}
+          environment_key: data_prep_requirements
+
+      environments:
+        - environment_key: data_prep_requirements
+          spec:
+            client: "3"
+            dependencies:
+              - "-r /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target}/files/data_preparation/data_prep_requirements.txt"
+
+      schedule:
+        quartz_cron_expression: "0 0 5 * * ?" # daily at 5am
+        timezone_id: UTC
+      <<: *permissions
+      # If you want to turn on notifications for this job, please uncomment the below code,
+      # and provide a list of emails to the on_failure argument.
+      #
+      # email_notifications:
+      #   on_failure:
+      #     - first@company.com
+      #     - second@company.com
\ No newline at end of file
diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/integration/model_serving_test.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/integration/model_serving_test.py.tmpl
new file mode 100644
index 00000000..fcd2d13e
--- /dev/null
+++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/integration/model_serving_test.py.tmpl
@@ -0,0 +1,36 @@
+import pytest
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.serving import ChatMessage, ChatMessageRole
+
+@pytest.fixture
+def workspace_client():
+    """Fixture to create a WorkspaceClient instance"""
+    return WorkspaceClient()
+
+@pytest.fixture
+def test_messages():
+    """Fixture with sample test messages"""
+    return [ChatMessage(content="What is MLflow?", role=ChatMessageRole.USER)]
+
+def test_endpoint_returns_response(workspace_client, test_messages):
+    # TODO: Replace with your actual endpoint name
+    ENDPOINT_NAME = "endpoint_name"
+
+    response = workspace_client.serving_endpoints.query(
+        name=ENDPOINT_NAME,
+        messages=test_messages,
+        temperature=1.0,
+        stream=False,
+    )
+
+    # Basic assertions to verify response structure
+    assert hasattr(response, 'choices'), "Response missing 'choices' field"
+    assert len(response.choices) > 0, "No choices in response"
+
+    first_choice = response.choices[0]
+    assert hasattr(first_choice, 'message'), "Choice missing 'message' field"
+    assert hasattr(first_choice.message, 'content'), "Message missing 'content' field"
+
+    # Verify content is not empty
+    assert isinstance(first_choice.message.content, str), "Content is not a string"
+    assert len(first_choice.message.content.strip()) > 0, "Content is empty"

From 1a88610b280a3666bb770c201e45790cc2ad07e5 Mon Sep 17 00:00:00 2001
From: Alex Baur
Date: Wed, 16 Jul 2025 16:05:22 -0700
Subject: [PATCH 2/5] Add files via upload

---
 .../docs/agentops-setup.md.tmpl | 397 ++++++++++++++++++
 1 file changed, 397 insertions(+)
 create mode 100644 template/{{.input_root_dir}}/docs/agentops-setup.md.tmpl

diff --git a/template/{{.input_root_dir}}/docs/agentops-setup.md.tmpl b/template/{{.input_root_dir}}/docs/agentops-setup.md.tmpl
new file mode 100644
index 00000000..7909670b
--- /dev/null
+++ b/template/{{.input_root_dir}}/docs/agentops-setup.md.tmpl
@@ -0,0 +1,397 @@
+# AgentOps Setup Guide
+[(back to main README)](../README.md)
+
+## Table of contents
+* [Intro](#intro)
+* [Create a hosted Git repo](#create-a-hosted-git-repo)
+* [Configure CI/CD]({{ if (eq .input_cicd_platform `github_actions`) }}#configure-cicd---github-actions{{ else if (eq .input_cicd_platform `azure_devops`) }}#configure-cicd---azure-devops{{ else if (eq .input_cicd_platform `gitlab`) }}#configure-cicd---gitlab{{ end }})
+{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}
+* [Merge PR with initial agent code](#merge-a-pr-with-your-initial-agent-code)
+{{- end }}
+{{ if not (eq .input_release_branch .input_default_branch) -}}
+* [Create release branch](#create-release-branch)
+{{ end -}}
+{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }}
+* [Deploy agent resources and enable production jobs](#deploy-agent-resources-and-enable-production-jobs){{ end }}
+* [Next steps](#next-steps)
+
+## Intro
+This page explains how to productionize the current project: setting up CI/CD, deploying agent resources, and running the agent development and deployment workflows.
+
+After following this guide, data scientists can follow the [Pull Request](pull-request.md) guide to make changes to agent code or deployed jobs.
+
+## Create a hosted Git repo
+Create a hosted Git repo to store project code, if you haven't already done so. From within the project
+directory, initialize Git and add your hosted Git repo as a remote:
+```
+git init --initial-branch={{ .input_default_branch }}
+```
+
+```
+git remote add upstream
+```
+
+Commit the current `README.md` file and other docs to the `{{ .input_default_branch }}` branch of the repo, to enable forking the repo:
+```
+{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}
+git add README.md docs .gitignore {{template `project_name_alphanumeric_underscore` .}}/resources/README.md
+git commit -m "Adding project README"
+{{ else }}
+git add .
+git commit -m "Adding CICD scaffolding"
+{{ end }}
+git push upstream {{ .input_default_branch }}
+```
+
+{{ if (eq .input_cicd_platform `github_actions`) -}}
+## Configure CI/CD - GitHub Actions
+
+### Prerequisites
+* You must be an account admin to add service principals to the account.
+* You must be a Databricks workspace admin in the staging and prod workspaces.
+  Verify that you're an admin by viewing the
+  [staging workspace admin console]({{template `databricks_staging_workspace_host` .}}#setting/accounts) and
+  [prod workspace admin console]({{template `databricks_prod_workspace_host` .}}#setting/accounts).
+  If the admin console UI loads instead of the Databricks workspace homepage, you are an admin.
+
+### Set up authentication for CI/CD
+#### Set up Service Principal
+{{ if eq .input_cloud `azure` }}
+To authenticate and manage agent resources created by CI/CD,
+[service principals]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }})
+for the project should be created and added to both staging and prod workspaces. Follow
+[Add a service principal to your Azure Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals#--add-a-service-principal-to-your-azure-databricks-account")) }})
+and [Add a service principal to a workspace]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals#--add-a-service-principal-to-a-workspace")) }})
+for details.
+
+For your convenience, we also have Terraform modules that can be used to [create](https://registry.terraform.io/modules/databricks/mlops-azure-project-with-sp-creation/databricks/latest) or [link](https://registry.terraform.io/modules/databricks/mlops-azure-project-with-sp-linking/databricks/latest) service principals.
+
+{{ else }}
+To authenticate and manage agent resources created by CI/CD,
+[service principals]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }})
+for the project should be created and added to both staging and prod workspaces. Follow
+[Add a service principal to your Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account")) }})
+and [Add a service principal to a workspace]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#add-a-service-principal-to-a-workspace")) }})
+for details.
+
+{{ if eq .input_cloud `aws` }}
+For your convenience, we also have a [Terraform module](https://registry.terraform.io/modules/databricks/mlops-aws-project/databricks/latest) that can set up your service principals.
+{{ end }}
+{{ end }}
+
+#### Configure Service Principal (SP) permissions
+When you initialize the stack, we set the catalog name in the `databricks.yml`, so we expect a catalog of the same name in each environment.
+
+If you want to use different catalog names, please set `uc_catalog` differently under each target environment:
+
+```
+targets:
+  dev:
+    variables:
+      uc_catalog:
+        description: Unity Catalog used to store data and artifacts.
+        default:
+
+```
+
+The SP must have the proper permissions in each respective environment and on the catalog for that environment.
+
+For the integration tests and workflows, the SP must have permissions to read and write to the specified schema and to create experiments and models,
+i.e.
for each environment: +- USE_CATALOG +- USE_SCHEMA +- MODIFY +- CREATE_MODEL +- CREATE_TABLE +- CREATE_VOLUME + +#### Set secrets for CI/CD +{{ if eq .input_cloud `azure` }} +After creating the service principals and adding them to the respective staging and prod workspaces, refer to +[Manage access tokens for a service principal]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals#--manage-access-tokens-for-a-service-principal")) }}) +and [Get Azure AD tokens for service principals]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/api/latest/aad/service-prin-aad-token")) }}) +to get your service principal credentials (tenant id, application id, and client secret) for both the staging and prod service principals, and [Encrypted secrets](https://docs.github.com/en/actions/security-guides/encrypted-secrets) +to add the following secrets to GitHub: +- `PROD_AZURE_SP_TENANT_ID` +- `PROD_AZURE_SP_APPLICATION_ID` +- `PROD_AZURE_SP_CLIENT_SECRET` +- `STAGING_AZURE_SP_TENANT_ID` +- `STAGING_AZURE_SP_APPLICATION_ID` +- `STAGING_AZURE_SP_CLIENT_SECRET` +- `WORKFLOW_TOKEN` : [Github token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic) with workflow permissions. This secret is needed for the Deploy CI/CD Workflow. +Be sure to update the [Workflow Permissions](https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token) section under Repo Settings > Actions > General to allow `Read and write permissions`. +{{ else }} +After creating the service principals and adding them to the respective staging and prod workspaces, follow +[Manage access tokens for a service principal]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#manage-access-tokens-for-a-service-principal")) }}) +to get service principal tokens for staging and prod workspace and follow [Encrypted secrets](https://docs.github.com/en/actions/security-guides/encrypted-secrets) +to add the secrets to GitHub: +- `STAGING_WORKSPACE_TOKEN` : service principal token for staging workspace +- `PROD_WORKSPACE_TOKEN` : service principal token for prod workspace +- `WORKFLOW_TOKEN` : [Github token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic) with workflow permissions. This secret is needed for the Deploy CI/CD Workflow. + +Next, be sure to update the [Workflow Permissions](https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token) section under Repo Settings > Actions > General: +- Allow `Read and write permissions`, +- Allow workflows to be able to open pull requests (PRs). +{{ end }} + +### Setting up CI/CD workflows +After setting up authentication for CI/CD, you can now set up CI/CD workflows. We provide a [Deploy CICD workflow](../.github/workflows/deploy-cicd.yml) that can be used to generate the other CICD workflows mentioned below for projects. +This workflow is manually triggered with `project_name` as parameter. 
This workflow will need to be triggered for each project to set up its set of CI/CD workflows that can be used to deploy resources and run jobs in the staging and prod workspaces. +These workflows will be defined under `.github/workflows`. + +If you want to deploy CI/CD for an initialized project (`Project-Only` AgentOps Stacks initialization), you can manually run the `deploy-cicd.yml` workflow from the [Github Actions UI](https://docs.github.com/en/actions/using-workflows/manually-running-a-workflow?tool=webui) once the project code has been added to your main repo. The workflow will create a pull request with all the changes against your {{ .input_default_branch }} branch. Review and approve it to commit the files to deploy CI/CD for the project. + +{{ else if (eq .input_cicd_platform `azure_devops`) -}} +## Configure CI/CD - Azure DevOps + +Azure DevOps Pipelines are defined under `.azure/devops-pipelines`: +- **`deploy-cicd.yml`**:
+ - Generates the other CICD pipelines mentioned below for projects
+ - Manually triggered with `project_name` as parameter +> Note that this workflow will need to be triggered for each project to set up its CI/CD. In order to run the `Push CICD Bundle to a New Branch` step in the workflow, the project needs to enable the Build Service to be able to contribute and create a branch for the project, +i.e when the deploy CI/CD pipeline is triggered, the build service that runs this pipeline needs the necessary permissions to be able to push. To do this, go to Project Settings -> Repositories -> Security -> Select Build Service under users and +set "Contribute", "Create Branch", and "Contribute to pull requests" to "Allow". + +Project-Specific pipelines: +- **`{{ .input_project_name }}-tests-ci.yml`**:
+ - **[CI]** Performs unit and integration tests
+ - Triggered on PR to main +- **`{{ .input_project_name }}-bundle-cicd.yml`**:
+ - **[CI]** Performs validation of Databricks resources defined under `{{template `project_name_alphanumeric_underscore` .}}/resources`
+ - Triggered on PR to main
+ - **[CD]** Deploys Databricks resources to the staging workspace
+ - Triggered on merging into main
+ - **[CD]** Deploys Databricks resources to the prod workspace
+ - Triggered on merging into release +> Note that these workflows are provided as example CI/CD workflows, and can be easily modified to match your preferred CI/CD order of operations. + +Within the CI/CD pipelines defined under `.azure/devops-pipelines`, we will be deploying Databricks resources to the defined staging and prod workspaces using the `databricks` CLI. This requires setting up authentication between the `databricks` CLI and Databricks. By default we show how to authenticate with service principals by passing [secret variables from a variable group](https://learn.microsoft.com/en-us/azure/devops/pipelines/scripts/cli/pipeline-variable-group-secret-nonsecret-variables?view=azure-devops). In a production setting it is recommended to either use an [Azure Key Vault](https://learn.microsoft.com/en-us/azure/devops/pipelines/release/azure-key-vault?view=azure-devops&tabs=yaml) to store these secrets, or alternatively use [Azure service connections](https://learn.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml). We describe below how you can adapt the project Pipelines to leverage service connections. Let's add these. + +``` +git add .azure +git commit -m "Adding devops-pipeline files" +git push upstream {{ .input_default_branch }} +``` + +### Service principal approach [Default] + +By default, we provide Azure Pipelines where authentication is done using service principals. + +#### Requirements: +- You must be an account admin to add service principals to the account. +- You must be a Databricks workspace admin in the staging and prod workspaces. Verify that you're an admin by viewing the + [staging workspace admin console]({{template `databricks_staging_workspace_host` .}}#setting/accounts) and + [prod workspace admin console]({{template `databricks_prod_workspace_host` .}}#setting/accounts). If + the admin console UI loads instead of the Databricks workspace homepage, you are an admin. +- Permissions to create Azure DevOps Pipelines in your Azure DevOps project. See the following [Azure DevOps prerequisites](https://learn.microsoft.com/en-us/azure/devops/organizations/security/about-permissions). +- Permissions to create Azure DevOps build policies. See the following [prerequisites](https://learn.microsoft.com/azure/devops/repos/git/branch-policies). + +#### Steps: +{{ if (eq .input_cloud `azure`) }} +1. Create two service principals - one to be used for deploying and running staging resources, and one to be used for deploying and running production resources. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for details on how to create a service principal. +1. [Add the staging and production service principals to your Azure Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals#add-service-principals-to-your-account-using-the-account-console")) }}), and following this add the staging service principal to the staging workspace, and production service principal to the production workspace. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for details. +1. 
Follow ['Get Azure AD tokens for the service principals']({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/api/latest/aad/service-prin-aad-token")) }})
+to get your service principal credentials (tenant ID, application ID, and client secret) for both the staging and prod service principals. You will use these credentials as variables in the project Azure Pipelines.
+{{ else }}
+1. Create two service principals - one to be used for deploying and running staging resources, and one to be used for deploying and running production resources. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }}) for details on how to create a service principal.
+1. [Add the staging and production service principals to your Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#add-service-principals-to-your-account-using-the-account-console")) }}), and following this add the staging service principal to the staging workspace and the production service principal to the production workspace. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }}) for details.
+1. Follow ['Get tokens for the service principals']({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#manage-personal-access-tokens-for-a-service-principal")) }})
+to get your service principal tokens for both the staging and prod service principals. You will use these tokens as variables in the project Azure Pipelines.
+{{ end }}
+1. Create separate Azure Pipelines under your Azure DevOps project using the ‘Existing Azure Pipelines YAML file’ option. Create one pipeline for each script. See [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/create-first-pipeline) for more details on creating Azure Pipelines.
+1. Create a new variable group called `{{ .input_root_dir }} variable group` defining the following secret variables (see [here](https://learn.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=classic#create-a-variable-group) for more details):
+{{ if (eq .input_cloud `azure`) }}
+   - `PROD_AZURE_SP_TENANT_ID`: tenant ID for the prod service principal
+   - `PROD_AZURE_SP_APPLICATION_ID`: application (client) ID for the prod service principal
+   - `PROD_AZURE_SP_CLIENT_SECRET`: client secret for the prod service principal
+   - `STAGING_AZURE_SP_TENANT_ID`: tenant ID for the staging service principal
+   - `STAGING_AZURE_SP_APPLICATION_ID`: application (client) ID for the staging service principal
+   - `STAGING_AZURE_SP_CLIENT_SECRET`: client secret for the staging service principal
+{{ else }}
+   - `PROD_WORKSPACE_TOKEN` : service principal token for prod workspace
+   - `STAGING_WORKSPACE_TOKEN` : service principal token for staging workspace
+{{ end }}
+   - Ensure that the Azure Pipelines created in the prior step have access to these variables by selecting the name of the pipelines under the 'Pipeline permissions' tab of this variable group.
+   - Alternatively you could store these secrets in an [Azure Key Vault](https://learn.microsoft.com/en-us/azure/devops/pipelines/release/key-vault-in-own-project?view=azure-devops&tabs=portal) and link those secrets as variables to be used in the Pipelines.
+1.
Define [build validation branch policies](https://learn.microsoft.com/en-us/azure/devops/repos/git/branch-policies?view=azure-devops&tabs=browser#build-validation) for the `{{ .input_default_branch }}` branch using the Azure build pipelines created in step 1. This is required so that any PR changes to the `{{ .input_default_branch }}` must build successfully before PRs can complete. +In the case of a monorepo, where there are multiple projects under a single repository, set a [path filter](https://learn.microsoft.com/en-us/azure/devops/repos/git/branch-policies?view=azure-devops&tabs=browser#path-filters) on the build validation policies, such that devops pipelines are only triggered when there are changes to the respective projects (e.g. the path filter would be `/project1/*` to trigger a devops pipeline when changes are made to _only_ files under the `project1` folder). + +{{ if (eq .input_cloud `azure`) }} +### Service connection approach [Recommended in production settings] + +#### Requirements: +- You must be an Azure account admin to add service principals to the account. +- You must be a Databricks workspace admin in the staging and prod workspaces. Verify that you're an admin by viewing the + [staging workspace admin console]({{template `databricks_staging_workspace_host` .}}#setting/accounts) and + [prod workspace admin console]({{template `databricks_prod_workspace_host` .}}#setting/accounts). If + the admin console UI loads instead of the Databricks workspace homepage, you are an admin. +- Permissions to create service connections within an Azure subscription. See the following [prerequisites](https://docs.microsoft.com/azure/devops/pipelines/library/service-endpoints). +- Permissions to create Azure DevOps Pipelines in your Azure DevOps project. See the following [Azure DevOps prerequisites](https://learn.microsoft.com/en-us/azure/devops/organizations/security/about-permissions). +- Permissions to create Azure DevOps build policies. See the following [prerequisites](https://learn.microsoft.com/azure/devops/repos/git/branch-policies). + +The ultimate aim of the service connection approach is to use two separate service connections, authenticated with a staging service principal and a production service principal, to deploy and run resources in the respective Azure Databricks workspaces. Taking this approach then negates the need to read client secrets or client IDs from the CI/CD pipelines. + +#### Steps: +1. Create two service principals - one to be used for deploying and running staging resources, and one to be used for deploying and running production resources. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) for details on how to create a service principal. +1. [Add the staging and production service principals to your Azure Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals#add-service-principals-to-your-account-using-the-account-console")) }}), and following this add the staging service principal to the staging workspace, and production service principal to the production workspace. See [here]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html")) }}) for details. +1. 
[Create two Azure Resource Manager service connections](https://learn.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops&tabs=yaml#create-a-service-connection) - one to be used to deploy to staging Databricks resources, the other for production resources. Each of these service connections should be authenticated with the respective staging and production service principals created in the prior step. +1. Update pipeline YAML files to use service connections rather than pipeline variables: + - First, remove any lines where the environment variables are set in tasks in `{{ .input_project_name }}-tests-ci.yml` or `{{ .input_project_name }}-bundle-cicd.yml` files. Specifically, any lines where the following env vars are used: `PROD_AZURE_SP_TENANT_ID`, `PROD_AZURE_SP_APPLICATION_ID`, `PROD_AZURE_SP_CLIENT_SECRET`, `STAGING_AZURE_SP_TENANT_ID`, `STAGING_AZURE_SP_APPLICATION_ID`, `STAGING_AZURE_SP_CLIENT_SECRET` + - Then, add the following AzureCLI task prior to installing the `databricks` cli in any of the pipeline jobs: + +```yaml +# Get Azure Resource Manager variables using service connection +- task: AzureCLI@2 + displayName: 'Extract information from Azure CLI' + inputs: + azureSubscription: # TODO: insert SERVICE_CONNECTION_NAME + addSpnToEnvironment: true + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + subscription_id=$(az account list --query "[?isDefault].id"|jq -r '.[0]') + echo "##vso[task.setvariable variable=ARM_CLIENT_ID]${servicePrincipalId}" + echo "##vso[task.setvariable variable=ARM_CLIENT_SECRET;issecret=true]${servicePrincipalKey}" + echo "##vso[task.setvariable variable=ARM_TENANT_ID]${tenantId}" + echo "##vso[task.setvariable variable=ARM_SUBSCRIPTION_ID]${subscription_id}" +``` + > Note that you will have to update this code snippet with the respective service connection names, depending on which Databricks workspace you are deploying resources to. + +1. Create separate Azure Pipelines under your Azure DevOps project using the ‘Existing Azure Pipelines YAML file’ option. Create one pipeline for each script. See [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/create-first-pipeline) for more details on creating Azure Pipelines. +1. Define [build validation branch policies](https://learn.microsoft.com/en-us/azure/devops/repos/git/branch-policies?view=azure-devops&tabs=browser#build-validation) for the `{{ .input_default_branch }}` branch using the Azure build pipelines created in step 1. This is required so that any PR changes to the `{{ .input_default_branch }}` must build successfully before PRs can complete. +In the case of a monorepo, where there are multiple projects under a single repository, set a [path filter](https://learn.microsoft.com/en-us/azure/devops/repos/git/branch-policies?view=azure-devops&tabs=browser#path-filters) on the build validation policies, such that devops pipelines are only triggered when there are changes to the respective projects (e.g. the path filter would be `/project1/*` to trigger a devops pipeline when changes are made to _only_ files under the `project1` folder). +{{ end }} + +### Setting up CI/CD workflows +After setting up authentication for CI/CD, you can now set up CI/CD workflows. We provide a [Deploy CICD workflow](../.azure/devops-pipelines/deploy-cicd.yml) that can be used to generate the other CICD workflows mentioned below for projects. +This workflow is manually triggered with `project_name` as parameter. 
This workflow will need to be triggered for each project to set up its set of CI/CD workflows that can be used to deploy ML resources and run ML jobs in the staging and prod workspaces. +These workflows will be defined under `.azure/devops-pipelines`. After generating these workflows, be sure to go through the above workflow-specific steps again to add the appropriate build branch policies and filters. + +{{ else if (eq .input_cicd_platform `gitlab`) }} +## Configure CI/CD - Gitlab Pipelines + +### Prerequisites +* You must be an account admin to add service principals to the account. +* You must be a Databricks workspace admin in the staging and prod workspaces. Verify that you're an admin by viewing the + [staging workspace admin console]({{template `databricks_staging_workspace_host` .}}#setting/accounts) and + [prod workspace admin console]({{template `databricks_prod_workspace_host` .}}#setting/accounts). If + the admin console UI loads instead of the Databricks workspace homepage, you are an admin. + +### Set up authentication for CI/CD +#### Set up Service Principal +To authenticate and manage resources created by CI/CD, +[service principals]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals")) }}) +should be created and added to test, staging and prod workspaces. + +Service principals can be created and managed in the your cloud provider identity solution or in Databricks directly. We normally recommend setting up a Databricks managed service principal. +Follow [Add a service principal to your Databricks account]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account")) }}) +and [Add a service principal to a workspace]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "administration-guide/users-groups/service-principals.html#add-a-service-principal-to-a-workspace")) }}) +for details. + +{{ if eq .input_cloud `aws` }} +For your convenience, we also have a [Terraform module](https://registry.terraform.io/modules/databricks/mlops-aws-project/databricks/latest) that can set up your service principals. +{{ end }} + +#### Configure Service Principal (SP) permissions +If the created project uses **Unity Catalog**, we expect a catalog to exist with the name of the deployment target by default. +For example, if the deployment target is dev, we expect a catalog named dev to exist in the workspace. +If you want to use different catalog names, please update the target names declared in the +{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}[{{ .input_project_name }}/databricks.yml](../{{template `project_name_alphanumeric_underscore` .}}/databricks.yml) +{{- else }} `databricks.yml` {{ end }} file. +If changing the staging, prod, or test deployment targets, you'll also need to update the workflows located in the .gitlab/pipelines directory. + +The SP must have proper permission in each respective environment and the catalog for the environments. + +For the integration tests and workflows, the SP must have permissions to read + write to the specified schema and create experiment and models. +i.e. 
for each environment:
+- USE_CATALOG
+- USE_SCHEMA
+- MODIFY
+- CREATE_MODEL
+- CREATE_TABLE
+- CREATE_VOLUME
+
+
+#### Gitlab environments and secrets for CI/CD
+After creating the service principals and adding them to the respective staging and prod workspaces,
+you need to add the client ID and secret to Gitlab so that it can authenticate into Databricks for the execution of the integration tests and deployments.
+First you need to set up the [Gitlab environments](https://docs.gitlab.com/ee/ci/environments/). Typically you set up the following environments:
+- integration
+- stage
+- production
+
+Add the following [Gitlab CI/CD variables](https://docs.gitlab.com/ee/ci/variables/) in each [environment](https://docs.gitlab.com/ee/ci/environments/index.html#limit-the-environment-scope-of-a-cicd-variable), with the corresponding service principal information:
+- SP_CLIENT_ID
+- SP_CLIENT_SECRET
+
+Ensure that the variable visibility is set to masked and hidden.
+
+### Building and Pushing the Docker image
+Gitlab executes the pipeline on a VM initialized from a Docker image.
+The default Docker image is: [databricksfieldeng/mlopsstack:latest](https://hub.docker.com/repository/docker/databricksfieldeng/mlopsstack/general).
+The Docker image should include all requirements to run the CI/CD pipelines (e.g. the Databricks CLI, Python and the libraries used in the unit tests, and Java for executing Spark locally on the VM).
+The folder `.gitlab/docker/` includes the files to build the Docker image.
+
+You can execute the following script to build the Docker image on your local machine and push it to a registry:
+`{{.input_root_dir}}/.gitlab/docker/push_image_to_gitlab.sh`
+Make sure that each pipeline yml file
+(in the folder `.gitlab/pipelines/`) points to that image.
+NOTE: you can use the same image for multiple projects.
+If a project requires additional Python libraries, they can also be installed as part of the pipeline scripts.
+
+### Setting up the CI/CD Pipeline
+Gitlab, by default, expects the pipeline file to be placed in the project root folder and to be named `.gitlab-ci.yml`.
+Change the default [pipeline configuration file](https://docs.gitlab.com/ee/ci/pipelines/settings.html#specify-a-custom-cicd-configuration-file):
+in the section `Settings > CI/CD > General Pipelines > CI/CD configuration file`,
+add the value `.gitlab/pipelines/{{.input_project_name}}-triggers-cicd.yml`, which is the project's main pipeline file.
+
+{{ end }}
+
+{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}
+
+## Merge a PR with your initial agent code
+Create and push a PR branch adding the agent code to the repository.
+
+```
+git checkout -b add-agent-code
+git add .
+git commit -m "Add agent code"
+git push upstream add-agent-code
+```
+
+Open a PR from the newly pushed branch. CI will run to ensure that tests pass
+on your initial agent code. Fix tests if needed, then get your PR reviewed and merged.
+After the pull request merges, pull the changes back into your local `{{ .input_default_branch }}`
+branch:
+
+```
+git checkout {{ .input_default_branch }}
+git pull upstream {{ .input_default_branch }}
+```
+{{- end }}
+
+{{ if not (eq .input_release_branch .input_default_branch) -}}
+## Create release branch
+Create and push a release branch called `{{ .input_release_branch }}` off of the `{{ .input_default_branch }}` branch of the repository:
+```
+git checkout -b {{ .input_release_branch }} {{ .input_default_branch }}
+git push upstream {{ .input_release_branch }}
+git checkout {{ .input_default_branch }}
+```
+
+Your production jobs will pull the agent code from this branch, while your staging jobs will pull the agent code from the `{{ .input_default_branch }}` branch. Note that the `{{ .input_default_branch }}` branch will be the source of truth for agent resource configs and CI/CD workflows.
+
+For future code changes, iterate against the `{{ .input_default_branch }}` branch and regularly deploy your code from staging to production by merging code changes from the `{{ .input_default_branch }}` branch into the `{{ .input_release_branch }}` branch.
+{{ end -}}
+
+{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}
+## Deploy agent resources and enable production jobs
+Follow the instructions in [{{ .input_project_name }}/resources/README.md](../{{template `project_name_alphanumeric_underscore` .}}/resources/README.md) to deploy agent resources and production jobs.
+{{- end }}
+
+## Next steps
+After you configure CI/CD and deploy the agent pipelines, notify data scientists working
+on the current project. They should now be able to follow the
+[pull request guide](pull-request.md) and
+{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`)}}[Agent resource config guide](../{{template `project_name_alphanumeric_underscore` .}}/resources/README.md){{- end }} to propose, test, and deploy
+agent code and pipeline changes to production.
\ No newline at end of file

From d77ac6295af295801bd6889ad9ddd073d7f43480 Mon Sep 17 00:00:00 2001
From: Alex Baur
Date: Wed, 16 Jul 2025 16:10:29 -0700
Subject: [PATCH 3/5] Update update_layout.tmpl

Updated layout to skip agentops files.
--- template/update_layout.tmpl | 67 +++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/template/update_layout.tmpl b/template/update_layout.tmpl index a666e23c..2eccc23c 100644 --- a/template/update_layout.tmpl +++ b/template/update_layout.tmpl @@ -2,6 +2,73 @@ {{ $project_name_alphanumeric_underscore := (regexp `-`).ReplaceAllString ((regexp `[^A-Za-z0-9_-]`).ReplaceAllString .input_project_name ``) `_` -}} {{ $root_dir := .input_root_dir}} +# Skip all Agentops files in the template +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/README.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/ingestion`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/notebooks`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/README.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/ingestion/fetch_data.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/ingestion/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/notebooks`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/notebooks/DataIngestion.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/notebooks`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/preprocessing`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/README.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/notebooks/DataPreprocessing.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/preprocessing/create_chunk.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/preprocessing/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/notebooks`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/vector_search_utils`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore 
`data_preparation/vector_search/README.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/notebooks/VectorSearch.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/vector_search_utils/utils.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/vector_search_utils/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/vector_search_utils/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_requirements.txt`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/README.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/notebooks`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/tools`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/README.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/notebooks/Agent.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/tools/ai_tools.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/tools/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/evaluation`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/notebooks`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/notebooks/AgentEvaluation.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/evaluation/evaluation.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/evaluation/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/README.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment/LaunchApp.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment/README.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment/requirements.txt`) }} +{{ 
skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment/utils.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/notebooks`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/serving`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/notebooks/ModelServing.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/serving/serving.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/serving/__init__.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/integration`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/integration/model_serving_test.py`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/agent-resource.yml`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/agent-artifacts-resource.yml`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/app-deployment-resource.yml`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/data-preparation-resource.yml`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/README_agentops.md`) }} + {{ if (eq .input_setup_cicd_and_project `Project_Only`) }} {{ skip (printf `%s/%s` $root_dir `.azure`) }} {{ skip (printf `%s/%s` $root_dir `.github`) }} From 01cae35af0e591dc5ab72a5b818ab9915655684c Mon Sep 17 00:00:00 2001 From: Alex Baur Date: Wed, 16 Jul 2025 16:14:03 -0700 Subject: [PATCH 4/5] Update update_layout.tmpl added additional skips for agentops files --- template/update_layout.tmpl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/template/update_layout.tmpl b/template/update_layout.tmpl index 2eccc23c..43c7fc01 100644 --- a/template/update_layout.tmpl +++ b/template/update_layout.tmpl @@ -68,6 +68,8 @@ {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/app-deployment-resource.yml`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/data-preparation-resource.yml`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/README_agentops.md`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `databricks_agentops.yml`) }} +{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `README_agentops.md`) }} {{ if (eq .input_setup_cicd_and_project `Project_Only`) }} {{ skip (printf `%s/%s` $root_dir `.azure`) }} From 68c34ef7dd98aee98c8fcd790231cc83b036498b Mon Sep 17 00:00:00 2001 From: Alex Baur Date: Mon, 21 Jul 2025 15:54:03 -0700 Subject: [PATCH 5/5] Simplified skips for directories. 
--- template/update_layout.tmpl | 55 ------------------------------------- 1 file changed, 55 deletions(-) diff --git a/template/update_layout.tmpl b/template/update_layout.tmpl index 43c7fc01..8ae98d48 100644 --- a/template/update_layout.tmpl +++ b/template/update_layout.tmpl @@ -4,63 +4,8 @@ # Skip all Agentops files in the template {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/README.md`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/ingestion`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/notebooks`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/README.md`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/__init__.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/ingestion/fetch_data.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/ingestion/__init__.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/notebooks`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_ingestion/notebooks/DataIngestion.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/notebooks`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/preprocessing`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/README.md`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/__init__.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/notebooks/DataPreprocessing.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/preprocessing/create_chunk.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/data_preprocessing/preprocessing/__init__.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/notebooks`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/vector_search_utils`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/README.md`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/notebooks/VectorSearch.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir 
$project_name_alphanumeric_underscore `data_preparation/vector_search/vector_search_utils/utils.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/vector_search_utils/__init__.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation/vector_search/vector_search_utils/__init__.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_requirements.txt`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/README.md`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/__init__.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/notebooks`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/tools`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/README.md`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/notebooks/Agent.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/tools/ai_tools.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent/tools/__init__.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/evaluation`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/notebooks`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/notebooks/AgentEvaluation.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/evaluation/evaluation.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_development/agent_evaluation/evaluation/__init__.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/README.md`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment/LaunchApp.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment/README.md`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment/requirements.txt`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/chat_interface_deployment/utils.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore 
`agent_deployment/model_serving/notebooks`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/serving`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/notebooks/ModelServing.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/serving/serving.py`) }} -{{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `agent_deployment/model_serving/serving/__init__.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/integration`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/integration/model_serving_test.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/agent-resource.yml`) }}