From 0ffe82e4f54d586f1a06734475df096aac101dfb Mon Sep 17 00:00:00 2001 From: ravi-databricks Date: Tue, 26 Aug 2025 17:51:57 -0700 Subject: [PATCH 01/15] Adding new demo's under doc site --- docs/content/demo/Append_FLOW_CF.md | 20 +++- docs/content/demo/Append_FLOW_EH.md | 22 ++++- .../demo/Apply_Changes_From_Snapshot.md | 20 +++- docs/content/demo/DAB.md | 98 +++++++++++++++++++ docs/content/demo/DAIS.md | 20 +++- docs/content/demo/DLT_Sink.md | 74 ++++++++++++++ docs/content/demo/Silver_Fanout.md | 26 +++-- docs/content/demo/Techsummit.md | 22 ++++- docs/content/demo/_index.md | 4 +- 9 files changed, 277 insertions(+), 29 deletions(-) create mode 100644 docs/content/demo/DAB.md create mode 100644 docs/content/demo/DLT_Sink.md diff --git a/docs/content/demo/Append_FLOW_CF.md b/docs/content/demo/Append_FLOW_CF.md index f90b4012..a77396c9 100644 --- a/docs/content/demo/Append_FLOW_CF.md +++ b/docs/content/demo/Append_FLOW_CF.md @@ -21,15 +21,26 @@ This demo will perform following tasks: databricks auth login --host WORKSPACE_HOST ``` -3. ```commandline +3. Install Python package requirements: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + ``` + +4. Clone dlt-meta: + ```commandline git clone https://github.com/databrickslabs/dlt-meta.git ``` -4. ```commandline +5. Navigate to project directory: + ```commandline cd dlt-meta ``` -5. Set python environment variable into terminal +6. Set python environment variable into terminal ```commandline dlt_meta_home=$(pwd) ``` @@ -38,7 +49,8 @@ This demo will perform following tasks: export PYTHONPATH=$dlt_meta_home ``` -6. ```commandline +7. Run the command: + ```commandline python demo/launch_af_cloudfiles_demo.py --cloud_provider_name=aws --dbr_version=15.3.x-scala2.12 --dbfs_path=dbfs:/tmp/DLT-META/demo/ --uc_catalog_name=dlt_meta_uc ``` diff --git a/docs/content/demo/Append_FLOW_EH.md b/docs/content/demo/Append_FLOW_EH.md index 00ea4712..cbf503ac 100644 --- a/docs/content/demo/Append_FLOW_EH.md +++ b/docs/content/demo/Append_FLOW_EH.md @@ -18,21 +18,32 @@ draft: false databricks auth login --host WORKSPACE_HOST ``` -3. ```commandline +3. Install Python package requirements: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + ``` + +4. Clone dlt-meta: + ```commandline git clone https://github.com/databrickslabs/dlt-meta.git ``` -4. ```commandline +5. Navigate to project directory: + ```commandline cd dlt-meta ``` -5. Set python environment variable into terminal +6. Set python environment variable into terminal ```commandline dlt_meta_home=$(pwd) ``` ```commandline export PYTHONPATH=$dlt_meta_home ``` -6. Eventhub +7. Configure Eventhub - Needs eventhub instance running - Need two eventhub topics first for main feed (eventhub_name) and second for append flow feed (eventhub_name_append_flow) - Create databricks secrets scope for eventhub keys @@ -61,7 +72,8 @@ draft: false - eventhub_secrets_scope_name: Databricks secret scope name e.g. eventhubs_dltmeta_creds - eventhub_port: Eventhub port -7. ```commandline +8. 
Run the command: + ```commandline python demo/launch_af_eventhub_demo.py --cloud_provider_name=aws --uc_catalog_name=dlt_meta_uc --eventhub_name=dltmeta_demo --eventhub_name_append_flow=dltmeta_demo_af --eventhub_secrets_scope_name=dltmeta_eventhub_creds --eventhub_namespace=dltmeta --eventhub_port=9093 --eventhub_producer_accesskey_name=RootManageSharedAccessKey --eventhub_consumer_accesskey_name=RootManageSharedAccessKey --eventhub_accesskey_secret_name=RootManageSharedAccessKey ``` diff --git a/docs/content/demo/Apply_Changes_From_Snapshot.md b/docs/content/demo/Apply_Changes_From_Snapshot.md index 8006ec74..ee294276 100644 --- a/docs/content/demo/Apply_Changes_From_Snapshot.md +++ b/docs/content/demo/Apply_Changes_From_Snapshot.md @@ -26,21 +26,33 @@ draft: false databricks auth login --host WORKSPACE_HOST ``` -3. ```commandline +3. Install Python package requirements: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + ``` + +4. Clone dlt-meta: + ```commandline git clone https://github.com/databrickslabs/dlt-meta.git ``` -4. ```commandline +5. Navigate to project directory: + ```commandline cd dlt-meta ``` -5. Set python environment variable into terminal +6. Set python environment variable into terminal ```commandline dlt_meta_home=$(pwd) ``` ```commandline export PYTHONPATH=$dlt_meta_home -6. ```commandline +7. Run the command: + ```commandline python demo/launch_acfs_demo.py --uc_catalog_name=<> ``` - uc_catalog_name : Unity catalog name diff --git a/docs/content/demo/DAB.md b/docs/content/demo/DAB.md new file mode 100644 index 00000000..f50619c7 --- /dev/null +++ b/docs/content/demo/DAB.md @@ -0,0 +1,98 @@ +--- +title: "DAB Demo" +date: 2024-02-26T14:25:26-04:00 +weight: 28 +draft: false +--- + +### DAB Demo + +## Overview +This demo showcases how to use Databricks Asset Bundles (DABs) with DLT-Meta: + +This demo will perform following steps: +- Create dlt-meta schema's for dataflowspec and bronze/silver layer +- Upload necessary resources to unity catalog volume +- Create DAB files with catalog, schema, file locations populated +- Deploy DAB to databricks workspace +- Run onboarding using DAB commands +- Run Bronze/Silver Pipelines using DAB commands +- Demo examples will showcase fan-out pattern in silver layer +- Demo example will show case custom transformations for bronze/silver layers +- Adding custom columns and metadata to Bronze tables +- Implementing SCD Type 1 to Silver tables +- Applying expectations to filter data in Silver tables + +### Steps: +1. Launch Command Prompt + +2. Install [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html) + - Once you install Databricks CLI, authenticate your current machine to a Databricks Workspace: + + ```commandline + databricks auth login --host WORKSPACE_HOST + ``` + +3. Install Python package requirements: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + ``` + +4. Clone dlt-meta: + ```commandline + git clone https://github.com/databrickslabs/dlt-meta.git + ``` + +5. Navigate to project directory: + ```commandline + cd dlt-meta + ``` + +6. Set python environment variable into terminal: + ```commandline + dlt_meta_home=$(pwd) + export PYTHONPATH=$dlt_meta_home + ``` + +7. 
Generate DAB resources and set up schemas: + This command will: + - Generate DAB configuration files + - Create DLT-Meta schemas + - Upload necessary files to volumes + ```commandline + python demo/generate_dabs_resources.py --source=cloudfiles --uc_catalog_name= --profile= + ``` + > Note: If you don't specify `--profile`, you'll be prompted for your Databricks workspace URL and access token. + +8. Deploy and run the DAB bundle: + - Navigate to the DAB directory: + ```commandline + cd demo/dabs + ``` + + - Validate the bundle configuration: + ```commandline + databricks bundle validate --profile= + ``` + + - Deploy the bundle to dev environment: + ```commandline + databricks bundle deploy --target dev --profile= + ``` + + - Run the onboarding job: + ```commandline + databricks bundle run onboard_people -t dev --profile= + ``` + + - Execute the pipelines: + ```commandline + databricks bundle run execute_pipelines_people -t dev --profile= + ``` + +![dab_onboarding_job.png](/images/dab_onboarding_job.png) +![dab_dlt_pipelines.png](/images/dab_dlt_pipelines.png) diff --git a/docs/content/demo/DAIS.md b/docs/content/demo/DAIS.md index 4fd468d9..b1f7ec48 100644 --- a/docs/content/demo/DAIS.md +++ b/docs/content/demo/DAIS.md @@ -23,15 +23,26 @@ This demo showcases DLT-META's capabilities of creating Bronze and Silver DLT pi databricks auth login --host WORKSPACE_HOST ``` -3. ```commandline +3. Install Python package requirements: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + ``` + +4. Clone dlt-meta: + ```commandline git clone https://github.com/databrickslabs/dlt-meta.git ``` -4. ```commandline +5. Navigate to project directory: + ```commandline cd dlt-meta ``` -5. Set python environment variable into terminal +6. Set python environment variable into terminal ```commandline dlt_meta_home=$(pwd) ``` @@ -39,7 +50,8 @@ This demo showcases DLT-META's capabilities of creating Bronze and Silver DLT pi export PYTHONPATH=$dlt_meta_home ``` -6. ```commandline +7. Run the command: + ```commandline python demo/launch_dais_demo.py --uc_catalog_name=<> --cloud_provider_name=<<>> ``` - uc_catalog_name : unit catalog name diff --git a/docs/content/demo/DLT_Sink.md b/docs/content/demo/DLT_Sink.md new file mode 100644 index 00000000..3e851f37 --- /dev/null +++ b/docs/content/demo/DLT_Sink.md @@ -0,0 +1,74 @@ +--- +title: "Lakeflow Declarative Pipelines Sink Demo" +date: 2024-02-26T14:25:26-04:00 +weight: 27 +draft: false +--- + +### Lakeflow Declarative Pipelines Sink Demo +This demo will perform following steps: +- Showcase onboarding process for dlt writing to external sink pattern +- Run onboarding for the bronze iot events +- Publish test events to kafka topic +- Run Bronze Lakeflow Declarative Pipelines which will read from kafka source topic and write to: + - Events delta table into UC + - Create quarantine table as per data quality expectations + - Writes to external kafka topics + - Writes to external dbfs location as external delta sink + +### Steps: +1. Launch Command Prompt + +2. Install [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html) + - Once you install Databricks CLI, authenticate your current machine to a Databricks Workspace: + + ```commandline + databricks auth login --host WORKSPACE_HOST + ``` + +3. 
Install Python package requirements: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + ``` + +4. Clone dlt-meta: + ```commandline + git clone https://github.com/databrickslabs/dlt-meta.git + ``` + +5. Navigate to project directory: + ```commandline + cd dlt-meta + ``` + +6. Set python environment variable into terminal: + ```commandline + dlt_meta_home=$(pwd) + export PYTHONPATH=$dlt_meta_home + ``` + +7. Configure Kafka (Optional): + If you are using secrets for kafka, create databricks secrets scope for source and sink kafka: + ```commandline + databricks secrets create-scope <> + ``` + ```commandline + databricks secrets put-secret --json '{ + "scope": "<>", + "key": "<>", + "string_value": "<>" + }' + ``` + +8. Run the command: + ```commandline + python demo/launch_dlt_sink_demo.py --uc_catalog_name=<> --source=kafka --kafka_source_topic=<> --kafka_sink_topic=<> --kafka_source_servers_secrets_scope_name=<> --kafka_source_servers_secrets_scope_key=<> --kafka_sink_servers_secret_scope_name=<> --kafka_sink_servers_secret_scope_key=<> --profile=<> + ``` + +![dlt_demo_sink.png](/images/dlt_demo_sink.png) +![dlt_delta_sink.png](/images/dlt_delta_sink.png) +![dlt_kafka_sink.png](/images/dlt_kafka_sink.png) diff --git a/docs/content/demo/Silver_Fanout.md b/docs/content/demo/Silver_Fanout.md index 8b57b9cf..6e5cf4eb 100644 --- a/docs/content/demo/Silver_Fanout.md +++ b/docs/content/demo/Silver_Fanout.md @@ -23,31 +23,43 @@ draft: false databricks auth login --host WORKSPACE_HOST ``` -3. ```commandline +3. Install Python package requirements: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + ``` + +4. Clone dlt-meta: + ```commandline git clone https://github.com/databrickslabs/dlt-meta.git ``` -4. ```commandline +5. Navigate to project directory: + ```commandline cd dlt-meta ``` -5. Set python environment variable into terminal +6. Set python environment variable into terminal ```commandline dlt_meta_home=$(pwd) ``` ```commandline export PYTHONPATH=$dlt_meta_home -6. ```commandline +7. Run the command: + ```commandline python demo/launch_silver_fanout_demo.py --uc_catalog_name=<> --cloud_provider_name=aws ``` - uc_catalog_name : aws or azure - cloud_provider_name : aws or azure - you can provide `--profile=databricks_profile name` in case you already have databricks cli otherwise command prompt will ask host and token. - - - 6a. Databricks Workspace URL: - - - Enter your workspace URL, with the format https://.cloud.databricks.com. To get your workspace URL, see Workspace instance names, URLs, and IDs. + a. Databricks Workspace URL: + Enter your workspace URL, with the format https://.cloud.databricks.com. To get your workspace URL, see Workspace instance names, URLs, and IDs. - - - 6b. Token: + b. Token: - In your Databricks workspace, click your Databricks username in the top bar, and then select User Settings from the drop down. - On the Access tokens tab, click Generate new token. 
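+   The `--profile` option mentioned above assumes a named profile already exists in `~/.databrickscfg`. A minimal sketch of creating one and reusing it with this demo (the profile name `dltmeta_demo` and the workspace URL are illustrative placeholders, not values from this repo):
+   ```commandline
+   # Create or update a named CLI profile (opens a browser to authenticate)
+   databricks auth login --host https://<your-workspace>.cloud.databricks.com --profile dltmeta_demo
+
+   # Reuse the profile so the demo launcher skips the interactive host/token prompts
+   python demo/launch_silver_fanout_demo.py --uc_catalog_name=<<uc catalog name>> --cloud_provider_name=aws --profile=dltmeta_demo
+   ```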
diff --git a/docs/content/demo/Techsummit.md b/docs/content/demo/Techsummit.md index d53b71da..996ae76e 100644 --- a/docs/content/demo/Techsummit.md +++ b/docs/content/demo/Techsummit.md @@ -17,15 +17,26 @@ This demo will launch auto generated tables(100s) inside single bronze and silve databricks auth login --host WORKSPACE_HOST ``` -3. ```commandline +3. Install Python package requirements: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + ``` + +4. Clone dlt-meta: + ```commandline git clone https://github.com/databrickslabs/dlt-meta.git ``` -4. ```commandline +5. Navigate to project directory: + ```commandline cd dlt-meta ``` -5. Set python environment variable into terminal +6. Set python environment variable into terminal ```commandline dlt_meta_home=$(pwd) ``` @@ -33,7 +44,10 @@ This demo will launch auto generated tables(100s) inside single bronze and silve export PYTHONPATH=$dlt_meta_home ``` -6. Run the command ```python demo/launch_techsummit_demo.py --uc_catalog_name=<> --cloud_provider_name=aws ``` +7. Run the command: + ```commandline + python demo/launch_techsummit_demo.py --uc_catalog_name=<> --cloud_provider_name=aws + ``` - uc_catalog_name : Unity Catalog name - cloud_provider_name : aws or azure - you can provide `--profile=databricks_profile name` in case you already have databricks cli otherwise command prompt will ask host and token diff --git a/docs/content/demo/_index.md b/docs/content/demo/_index.md index 52cc5891..ba60b748 100644 --- a/docs/content/demo/_index.md +++ b/docs/content/demo/_index.md @@ -10,4 +10,6 @@ draft: false 3. **Append FLOW Autoloader Demo**: Write to same target from multiple sources using append_flow and adding file metadata using [File metadata column](https://docs.databricks.com/en/ingestion/file-metadata-column.html) 4. **Append FLOW Eventhub Demo**: Write to same target from multiple sources using append_flow and adding using [File metadata column](https://docs.databricks.com/en/ingestion/file-metadata-column.html) 5. **Silver Fanout Demo**: This demo will showcase fanout architecture can be implemented in silver layer - 6. **Apply Changes From Snapshot Demo**: This demo will showcase [create_auto_cdc_from_snapshot_flow](https://docs.databricks.com/aws/en/dlt-ref/dlt-python-ref-apply-changes-from-snapshot) can be implemented inside bronze and silver layer \ No newline at end of file + 6. **Apply Changes From Snapshot Demo**: This demo will showcase [create_auto_cdc_from_snapshot_flow](https://docs.databricks.com/aws/en/dlt-ref/dlt-python-ref-apply-changes-from-snapshot) can be implemented inside bronze and silver layer + 7. **Lakeflow Declarative Pipelines Sink Demo**: This demo showcases the implementation of write to external sinks like delta and kafka + 8. 
**DAB Demo**: This demo showcases how to use Databricks Assets Bundles with dlt-meta \ No newline at end of file From 4fa0ff56a5416e73ced5723855897c282e0721d5 Mon Sep 17 00:00:00 2001 From: ravi-databricks Date: Wed, 27 Aug 2025 12:43:34 -0700 Subject: [PATCH 02/15] updated docs, readme for: 1.Updating feature table with latest api names 2.Updated docs readme for metadata prep info 3.Updated docs cli readme to add dev requirements --- README.md | 7 +- docs/content/getting_started/dltmeta_cli.md | 64 +++++++++++++++---- .../getting_started/metadatapreperation.md | 3 +- 3 files changed, 56 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index bc76b25e..b7d21f34 100755 --- a/README.md +++ b/README.md @@ -53,14 +53,15 @@ In practice, a single generic pipeline reads the Dataflowspec and uses it to orc | Custom transformations | Bronze, Silver layer accepts custom functions| | Data Quality Expecations Support | Bronze, Silver layer | | Quarantine table support | Bronze layer | -| [apply_changes](https://docs.databricks.com/en/delta-live-tables/python-ref.html#cdc) API support | Bronze, Silver layer | -| [apply_changes_from_snapshot](https://docs.databricks.com/en/delta-live-tables/python-ref.html#change-data-capture-from-database-snapshots-with-python-in-delta-live-tables) API support | Bronze layer| +| [create_auto_cdc_flow](https://docs.databricks.com/aws/en/dlt-ref/dlt-python-ref-apply-changes) API support | Bronze, Silver layer | +| [create_auto_cdc_from_snapshot_flow](https://docs.databricks.com/aws/en/dlt-ref/dlt-python-ref-apply-changes-from-snapshot) API support | Bronze layer| | [append_flow](https://docs.databricks.com/en/delta-live-tables/flows.html#use-append-flow-to-write-to-a-streaming-table-from-multiple-source-streams) API support | Bronze layer| | Liquid cluster support | Bronze, Bronze Quarantine, Silver tables| | [DLT-META CLI](https://databrickslabs.github.io/dlt-meta/getting_started/dltmeta_cli/) | ```databricks labs dlt-meta onboard```, ```databricks labs dlt-meta deploy``` | | Bronze and Silver pipeline chaining | Deploy dlt-meta pipeline with ```layer=bronze_silver``` option using Direct publishing mode | -| [DLT Sinks](https://docs.databricks.com/aws/en/delta-live-tables/dlt-sinks) |Supported formats:external ```delta table```, ```kafka```.Bronze, Silver layers| +| [create_sink](https://docs.databricks.com/aws/en/dlt-ref/dlt-python-ref-sink) API support |Supported formats:external ```delta table```, ```kafka```.Bronze, Silver layers| | [Databricks Asset Bundles](https://docs.databricks.com/aws/en/dev-tools/bundles/) | Supported +| [DLT-META UI](https://github.com/databrickslabs/dlt-meta/tree/main/lakehouse_app#dlt-meta-lakehouse-app-setup) | Uses Databricks Lakehouse DLT-META App ## Getting Started diff --git a/docs/content/getting_started/dltmeta_cli.md b/docs/content/getting_started/dltmeta_cli.md index 3aae6329..3344d971 100644 --- a/docs/content/getting_started/dltmeta_cli.md +++ b/docs/content/getting_started/dltmeta_cli.md @@ -5,20 +5,58 @@ weight: 7 draft: false --- -### pre-requisites: -- [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/tutorial.html) -- Once you install Databricks CLI, authenticate your current machine to a Databricks Workspace: - - ```commandline - databricks auth login --host WORKSPACE_HOST - ``` +### Prerequisites: - Python 3.8.0 + -##### Steps: -1. ``` git clone https://github.com/databrickslabs/dlt-meta.git ``` -2. ``` cd dlt-meta ``` -3. ``` python -m venv .venv ``` -4. ```source .venv/bin/activate ``` -5. 
``` pip install databricks-sdk ``` +- [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/tutorial.html) + +### Steps: +1. Install and authenticate Databricks CLI: + ```commandline + databricks auth login --host WORKSPACE_HOST + ``` + +2. Install dlt-meta via Databricks CLI: + ```commandline + databricks labs install dlt-meta + ``` + +3. Clone dlt-meta repository: + ```commandline + git clone https://github.com/databrickslabs/dlt-meta.git + ``` + +4. Navigate to project directory: + ```commandline + cd dlt-meta + ``` + +5. Create Python virtual environment: + ```commandline + python -m venv .venv + ``` + +6. Activate virtual environment: + ```commandline + source .venv/bin/activate + ``` + +7. Install required packages: + ```commandline + # Core requirements + pip install "PyYAML>=6.0" setuptools databricks-sdk + + # Development requirements + pip install flake8==6.0 delta-spark==3.0.0 pytest>=7.0.0 coverage>=7.0.0 pyspark==3.5.5 + + # Integration test requirements + pip install "typer[all]==0.6.1" + ``` + +8. Set environment variables: + ```commandline + dlt_meta_home=$(pwd) + export PYTHONPATH=$dlt_meta_home + ``` ![onboardingDLTMeta.gif](/images/onboardingDLTMeta.gif) diff --git a/docs/content/getting_started/metadatapreperation.md b/docs/content/getting_started/metadatapreperation.md index 1ae1efb0..498507d3 100644 --- a/docs/content/getting_started/metadatapreperation.md +++ b/docs/content/getting_started/metadatapreperation.md @@ -64,8 +64,7 @@ The `onboarding.json` file contains links to [silver_transformations.json](https | silver_transformation_json | Silver table sql transformation json path | | silver_data_quality_expectations_json_{env} | Silver table data quality expectations json file path | silver_append_flows | Silver table append flows json. e.g.`"silver_append_flows":[{"name":"customer_bronze_flow", -| silver_apply_changes_from_snapshot | Silver apply changes from snapshot Json e.g. Mandatory fields: keys=["userId"], scd_type=`1` or `2` optional fields: track_history_column_list=`[col1]`, track_history_except_column_list=`[col2]` | -"create_streaming_table": false,"source_format": "cloudFiles", "source_details": {"source_database": "APP","source_table":"CUSTOMERS", "source_path_dev": "tests/resources/data/customers", "source_schema_path": "tests/resources/schema/customer_schema.ddl"},"reader_options": {"cloudFiles.format": "json","cloudFiles.inferColumnTypes": "true","cloudFiles.rescuedDataColumn": "_rescued_data"},"once": true}]`| +| silver_apply_changes_from_snapshot | Silver apply changes from snapshot Json e.g. 
Mandatory fields: keys=["userId"], scd_type=`1` or `2` optional fields: track_history_column_list=`[col1]`, track_history_except_column_list=`[col2]`| From 62504c2a68f79b5eb3fbd64105fd84a8f17fc1f1 Mon Sep 17 00:00:00 2001 From: "dattatraya.walake" Date: Wed, 27 Aug 2025 16:38:20 -0400 Subject: [PATCH 03/15] added dlt sink demo and silver spec table name ui component --- lakehouse_app/app.py | 56 ++++++++++++++++++++---- lakehouse_app/templates/landingPage.html | 14 ++++-- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/lakehouse_app/app.py b/lakehouse_app/app.py index db2b6989..3af13fa7 100644 --- a/lakehouse_app/app.py +++ b/lakehouse_app/app.py @@ -227,8 +227,8 @@ def start_command(): if 'PYTHONPATH' not in os.environ or not os.path.isdir(os.environ.get('PYTHONPATH', '')): commands = [ "pip install databricks-cli", - # "git clone https://github.com/databrickslabs/dlt-meta.git", - "git clone https://github.com/dattawalake/dlt-meta.git", + "git clone https://github.com/databrickslabs/dlt-meta.git", + #"git clone https://github.com/dattawalake/dlt-meta.git", f"python -m venv {current_directory}/dlt-meta/.venv", f"export HOME={current_directory}", "cd dlt-meta", @@ -236,6 +236,7 @@ def start_command(): f"export PYTHONPATH={current_directory}/dlt-meta/", "pwd", "pip install databricks-sdk", + "pip install PyYAML", ] print("Start setting up dlt-meta environment") for c in commands: @@ -322,6 +323,7 @@ def handle_onboard_form(): "silver_schema": request.form.get('silver_schema', 'dltmeta_silver_7b4e981029b843c799bf61a0a121b3ca'), "dlt_meta_layer": request.form.get('dlt_meta_layer', '1'), "bronze_table": request.form.get('bronze_table', 'bronze_dataflowspec'), + "silver_table": request.form.get('silver_table', 'silver_dataflowspec'), "overwrite": "1" if request.form.get('overwrite') == "1" else "0", "version": request.form.get('version', 'v1'), "environment": request.form.get('environment', 'prod'), @@ -379,22 +381,58 @@ def run_demo(): demo_dict = {"demo_cloudfiles": "demo/launch_af_cloudfiles_demo.py", "demo_acf": "demo/launch_acfs_demo.py", "demo_silverfanout": "demo/launch_silver_fanout_demo.py", - "demo_dias": "demo/launch_dais_demo.py" + "demo_dias": "demo/launch_dais_demo.py", + "demo_dlt_sink": "demo/launch_dlt_sink_demo.py", + "demo_dabs": "demo/generate_dabs_resources.py" } demo_file = demo_dict.get(code_to_run, None) uc_name = request.json.get('uc_name', '') - result = subprocess.run(f"python {current_directory}/{demo_file} --uc_catalog_name {uc_name} --profile DEFAULT", - shell=True, - capture_output=True, - text=True - ) + + if code_to_run == 'demo_dabs': + + # Step 1: Generate Databricks resources + subprocess.run(f"python {current_directory}/{demo_file} --uc_catalog_name {uc_name} --source=cloudfiles --profile DEFAULT", + shell=True, + capture_output=True, + text=True + ) + + # Step 2: Change working directory to demo/dabs for all next commands + dabs_dir = os.path.join("demo", "dabs") + subprocess.run("databricks bundle validate --profile=DEFAULT", cwd=f"{current_directory}/demo/dabs",shell=True, + capture_output=True, + text=True) + + # Step 4: Deploy the bundle + subprocess.run("databricks bundle deploy --target dev --profile=DEFAULT", cwd=f"{current_directory}/demo/dabs", shell=True, + capture_output=True, + text=True) + + # Step 5: Run 'onboard_people' task + rs1 = subprocess.run("databricks bundle run onboard_people -t dev --profile=DEFAULT", cwd=f"{current_directory}/demo/dabs", shell=True, + capture_output=True, + text=True) + print(f"onboarding completed: 
{rs1.stdout}") + # Step 6: Run 'execute_pipelines_people' task + result = subprocess.run("databricks bundle run execute_pipelines_people -t dev --profile=DEFAULT", cwd=f"{current_directory}/demo/dabs", + shell=True, + capture_output=True, + text=True + ) + print(f"execution of pipeline completed: {result.stdout}") + else: + result = subprocess.run(f"python {current_directory}/{demo_file} --uc_catalog_name {uc_name} --profile DEFAULT", + shell=True, + capture_output=True, + text=True + ) return extract_command_output(result) def extract_command_output(result): stdout = result.stdout job_id_match = re.search(r"job_id=(\d+) | pipeline=(\d+)", stdout) - url_match = re.search(r"url=(https?://[^\s]+)", stdout) + url_match = re.search(r"(https?://[^\s]+)", stdout) #re.search(r"url=(https?://[^\s]+)", stdout) job_id = job_id_match.group(1) or job_id_match.group(2) if job_id_match else None job_url = url_match.group(1) if url_match else None diff --git a/lakehouse_app/templates/landingPage.html b/lakehouse_app/templates/landingPage.html index 75e865a4..2f96ca06 100644 --- a/lakehouse_app/templates/landingPage.html +++ b/lakehouse_app/templates/landingPage.html @@ -655,6 +655,11 @@

Step 1 : Onboarding

+
+ + +
@@ -841,6 +846,8 @@

Available Demos

+ +
@@ -956,7 +963,7 @@ const modalContent = `