Support writing to Databricks delta table using a module #1483

Open · wants to merge 4 commits into branch-24.06
4 changes: 3 additions & 1 deletion morpheus/modules/__init__.py
@@ -29,6 +29,7 @@
from morpheus.modules import to_control_message
from morpheus.modules import write_to_elasticsearch
from morpheus.modules import write_to_file
from morpheus.modules import databricks_deltalake_writer

__all__ = [
"file_batcher",
@@ -43,5 +44,6 @@
"serialize",
"to_control_message",
"write_to_file",
"write_to_elasticsearch"
"write_to_elasticsearch",
"databricks_deltalake_writer"
]
113 changes: 113 additions & 0 deletions morpheus/modules/databricks_deltalake_writer.py
@@ -0,0 +1,113 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
Contributor suggested change:
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import mrc
import pandas as pd
from mrc.core import operators as ops

import cudf

from morpheus.utils.module_utils import register_module
from morpheus.config import Config
from morpheus.messages import MessageMeta
from morpheus.utils.module_ids import DATABRICKS_DELTALAKE_WRITER
from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE

logger = logging.getLogger(__name__)

IMPORT_EXCEPTION = None
IMPORT_ERROR_MESSAGE = "Databricks writer module requires the databricks-connect package to be installed."

try:
from databricks.connect import DatabricksSession
from pyspark.sql import types as sql_types
except ImportError as import_exc:
IMPORT_EXCEPTION = import_exc
Contributor comment: Is 'IMPORT_EXCEPTION' used anywhere? If we're not handling the failure to import via some kind of workaround, then let's log an error here and re-raise the ImportError.
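A minimal sketch of the fail-fast variant the reviewer describes, assuming no import-failure workaround is intended (an illustration, not what the PR currently does):

    # Hypothetical alternative guard: log and re-raise instead of
    # stashing the exception in IMPORT_EXCEPTION.
    try:
        from databricks.connect import DatabricksSession
        from pyspark.sql import types as sql_types
    except ImportError as import_exc:
        logger.error(IMPORT_ERROR_MESSAGE)
        raise ImportError(IMPORT_ERROR_MESSAGE) from import_exc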


def _extract_schema_from_pandas_dataframe(df: pd.DataFrame) -> "sql_types.StructType":
Contributor comment: Let's add at least one unit test for this helper function; mocking out everything to deal with the Spark internals in the module's work unit probably isn't worth it right now, but having a test file gives us a spot for sprouting new tests later. (A sketch of such a test follows the function below.)

"""
Extract approximate schemas from pandas dataframe
"""
spark_schema = []
for col, dtype in df.dtypes.items():
try:
if dtype == "bool":
spark_dtype = sql_types.StructField(col, sql_types.BooleanType())
elif dtype == "int64":
spark_dtype = sql_types.StructField(col, sql_types.LongType())
elif dtype == "int32":
spark_dtype = sql_types.StructField(col, sql_types.IntegerType())
elif dtype == "float64":
spark_dtype = sql_types.StructField(col, sql_types.DoubleType())
elif dtype == "float32":
spark_dtype = sql_types.StructField(col, sql_types.FloatType())
elif dtype == "datetime64[ns]":
spark_dtype = sql_types.StructField(col, sql_types.TimestampType())
else:
spark_dtype = sql_types.StructField(col, sql_types.StringType())
except Exception as e:
logger.error("Encountered error %s while converting columns %s with data type %s", e, col, dtype)
spark_dtype = sql_types.StructField(col, sql_types.StringType())
spark_schema.append(spark_dtype)
return sql_types.StructType(spark_schema)
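A minimal unit-test sketch for the helper, along the lines the reviewer suggests (the test file path is an assumption; it needs the module's optional dependencies importable, but no live Spark session):

    # tests/modules/test_databricks_deltalake_writer.py (hypothetical path)
    import pandas as pd
    from pyspark.sql import types as sql_types

    from morpheus.modules.databricks_deltalake_writer import _extract_schema_from_pandas_dataframe


    def test_extract_schema_from_pandas_dataframe():
        df = pd.DataFrame({
            "flag": pd.Series([True], dtype="bool"),
            "count": pd.Series([1], dtype="int64"),
            "score": pd.Series([0.5], dtype="float64"),
            "name": pd.Series(["a"], dtype="object"),  # unmapped dtype falls back to StringType
        })

        schema = _extract_schema_from_pandas_dataframe(df)

        assert schema == sql_types.StructType([
            sql_types.StructField("flag", sql_types.BooleanType()),
            sql_types.StructField("count", sql_types.LongType()),
            sql_types.StructField("score", sql_types.DoubleType()),
            sql_types.StructField("name", sql_types.StringType()),
        ])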

@register_module(DATABRICKS_DELTALAKE_WRITER, MORPHEUS_MODULE_NAMESPACE)
def databricks_deltalake_writer(builder: mrc.Builder):
    module_config = builder.get_current_module_config()
    """module_config contains all the required configuration parameters that would otherwise be passed to the stage.

    Parameters
    ----------
    config : morpheus.config.Config
        Pipeline configuration instance.
    delta_path : str, default None
        Path of the delta table where the data needs to be written or updated.
    databricks_host : str, default None
        URL of Databricks host to connect to.
    databricks_token : str, default None
        Access token for Databricks cluster.
    databricks_cluster_id : str, default None
        Databricks cluster to be used to query the data as per SQL provided.
    delta_table_write_mode : str, default "append"
        Delta table write mode for storing data.
    """
Contributor comment on lines +69 to +85, suggested change (make the docstring the first statement in the function so Python actually treats it as the docstring, and drop the stray leading sentence):

    """
    Parameters
    ----------
    config : morpheus.config.Config
        Pipeline configuration instance.
    delta_path : str, default None
        Path of the delta table where the data needs to be written or updated.
    databricks_host : str, default None
        URL of Databricks host to connect to.
    databricks_token : str, default None
        Access token for Databricks cluster.
    databricks_cluster_id : str, default None
        Databricks cluster to be used to query the data as per SQL provided.
    delta_table_write_mode : str, default "append"
        Delta table write mode for storing data.
    """
    module_config = builder.get_current_module_config()


    delta_path = module_config.get("DELTA_PATH", None)
    delta_table_write_mode = module_config.get("DELTA_WRITE_MODE", "append")
    databricks_host = module_config.get("DATABRICKS_HOST", None)
    databricks_token = module_config.get("DATABRICKS_TOKEN", None)
    databricks_cluster_id = module_config.get("DATABRICKS_CLUSTER_ID", None)
Contributor comment: Please add error checking here for any config parameter that might cause the builder.remote call to fail later; this gives us more immediate feedback during module loading instead of at runtime. (A sketch of such validation follows.)
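A minimal validation sketch along those lines (the error type and message are assumptions):

    # Hypothetical fail-fast check at module load time, before the
    # DatabricksSession is built.
    required = {
        "DELTA_PATH": delta_path,
        "DATABRICKS_HOST": databricks_host,
        "DATABRICKS_TOKEN": databricks_token,
        "DATABRICKS_CLUSTER_ID": databricks_cluster_id,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        raise ValueError(f"databricks_deltalake_writer is missing required config values: {missing}")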


    spark = DatabricksSession.builder.remote(host=databricks_host,
                                             token=databricks_token,
                                             cluster_id=databricks_cluster_id).getOrCreate()

    def write_to_deltalake(message: MessageMeta):
        df = message.copy_dataframe()
        if isinstance(df, cudf.DataFrame):
            df = df.to_pandas()
        schema = _extract_schema_from_pandas_dataframe(df)
        spark_df = spark.createDataFrame(df, schema=schema)
        spark_df.write \
            .format('delta') \
            .option("mergeSchema", "true") \
            .mode(delta_table_write_mode) \
            .save(delta_path)
        return message

    def node_fn(obs: mrc.Observable, sub: mrc.Subscriber):
        obs.pipe(ops.map(write_to_deltalake), ops.flatten(), ops.filter(lambda x: x is not None)).subscribe(sub)

    node = builder.make_node(DATABRICKS_DELTALAKE_WRITER, mrc.core.operators.build(node_fn))

    builder.register_module_input("input", node)
    builder.register_module_output("output", node)
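For context, a sketch of how a pipeline might load the registered module via Morpheus' generic module-hosting stage (the pipeline wiring and config values here are illustrative assumptions, not taken from this PR):

    from morpheus.config import Config
    from morpheus.pipeline import LinearPipeline
    from morpheus.stages.general.linear_modules_stage import LinearModulesStage

    config = Config()
    pipeline = LinearPipeline(config)
    # ... add a source stage and any preprocessing here ...

    module_config = {
        "module_id": "DatabricksDeltalakeWriter",
        "module_name": "databricks_deltalake_writer",
        "namespace": "morpheus",
        "DELTA_PATH": "/delta/example_table",                       # assumed path
        "DELTA_WRITE_MODE": "append",
        "DATABRICKS_HOST": "https://example.cloud.databricks.com",  # assumed host
        "DATABRICKS_TOKEN": "<token>",
        "DATABRICKS_CLUSTER_ID": "<cluster-id>",
    }
    pipeline.add_stage(
        LinearModulesStage(config, module_config, input_port_name="input", output_port_name="output"))
    pipeline.run()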
1 change: 1 addition & 0 deletions morpheus/utils/module_ids.py
@@ -27,3 +27,4 @@
FILTER_CM_FAILED = "FilterCmFailed"
PAYLOAD_BATCHER = "PayloadBatcher"
WRITE_TO_ELASTICSEARCH = "WriteToElasticsearch"
DATABRICKS_DELTALAKE_WRITER = "DatabricksDeltalakeWriter"