From be608fdd0782d34cbdd749e998cb0adab88bcbc5 Mon Sep 17 00:00:00 2001 From: Bhargav Suryadevara Date: Wed, 6 Sep 2023 14:36:48 -0500 Subject: [PATCH] Eliminate redundant code blocks in modules and stages (#1123) Eliminated redundant code blocks in both modules and stages, introducing controllers to enhance maintainability, and subsequently updated tests to align with these changes. - file_to_df - filter_detections - mlflow_model_writer - serializer - write_to_file Fixed preserve columns property issue. closes #965 #1074 Authors: - Bhargav Suryadevara (https://github.com/bsuryadevara) - Michael Demoret (https://github.com/mdemoret-nv) Approvers: - Christopher Harris (https://github.com/cwharris) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1123 --- .../morpheus/dfp/modules/__init__.py | 8 +- .../morpheus/dfp/modules/dfp_inference.py | 14 +- .../morpheus/dfp/modules/dfp_monitor.py | 2 +- .../morpheus/dfp/modules/dfp_training.py | 11 +- .../dfp/stages/dfp_file_batcher_stage.py | 6 +- .../morpheus/dfp/stages/dfp_file_to_df.py | 151 +-------- .../dfp/stages/dfp_mlflow_model_writer.py | 205 +----------- .../morpheus/dfp/utils/dfp_arg_parser.py | 3 +- .../morpheus/dfp/utils/model_cache.py | 46 +-- .../common/feature_extractor.py | 111 +++---- morpheus/controllers/file_to_df_controller.py | 237 ++++++++++++++ .../filter_detections_controller.py | 165 ++++++++++ .../mlflow_model_writer_controller.py | 305 ++++++++++++++++++ morpheus/controllers/monitor_controller.py | 235 ++++++++++++++ morpheus/controllers/serialize_controller.py | 135 ++++++++ .../controllers/write_to_file_controller.py | 136 ++++++++ morpheus/loaders/file_to_df_loader.py | 164 ++-------- morpheus/modules/file_to_df.py | 156 +-------- morpheus/modules/filter_detections.py | 121 ++----- morpheus/modules/mlflow_model_writer.py | 208 +----------- morpheus/modules/serialize.py | 67 +--- morpheus/modules/write_to_file.py | 72 +---- morpheus/stages/general/monitor_stage.py | 2 +- morpheus/stages/output/write_to_file_stage.py | 78 +---- .../postprocess/filter_detections_stage.py | 110 +------ .../stages/postprocess/serialize_stage.py | 71 +--- morpheus/utils/column_info.py | 34 +- morpheus/utils/monitor_utils.py | 215 ------------ morpheus/utils/schema_transforms.py | 24 +- .../test_dfp_file_to_df.py | 39 ++- .../test_dfp_mlflow_model_writer.py | 50 +-- tests/test_cli.py | 18 +- tests/test_filter_detections_stage.py | 20 +- tests/test_monitor_stage.py | 8 +- tests/test_serialize_stage.py | 12 +- tests/utils/nvt/test_schema_converters.py | 17 +- 36 files changed, 1590 insertions(+), 1666 deletions(-) create mode 100644 morpheus/controllers/file_to_df_controller.py create mode 100644 morpheus/controllers/filter_detections_controller.py create mode 100644 morpheus/controllers/mlflow_model_writer_controller.py create mode 100644 morpheus/controllers/monitor_controller.py create mode 100644 morpheus/controllers/serialize_controller.py create mode 100644 morpheus/controllers/write_to_file_controller.py diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/modules/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/modules/__init__.py index fd5169d061..549cf4c680 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/modules/__init__.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/modules/__init__.py @@ -17,17 +17,17 @@ # When segment modules are imported, they're added to the module registry. 
# To avoid flake8 warnings about unused code, the noqa flag is used during import. -from dfp.modules import dfp_monitor -from dfp.modules import dfp_split_users from dfp.modules import dfp_data_prep +from dfp.modules import dfp_deployment from dfp.modules import dfp_inference +from dfp.modules import dfp_inference_pipe +from dfp.modules import dfp_monitor from dfp.modules import dfp_postprocessing from dfp.modules import dfp_preproc from dfp.modules import dfp_rolling_window +from dfp.modules import dfp_split_users from dfp.modules import dfp_training -from dfp.modules import dfp_inference_pipe from dfp.modules import dfp_training_pipe -from dfp.modules import dfp_deployment __all__ = [ "dfp_monitor", diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_inference.py b/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_inference.py index 48f8e41382..8fa9ce97de 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_inference.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_inference.py @@ -64,15 +64,23 @@ def dfp_inference(builder: mrc.Builder): model_name_formatter = config.get("model_name_formatter", None) fallback_user = config.get("fallback_username", "generic_user") - + model_fetch_timeout = config.get("model_fetch_timeout", 1.0) timestamp_column_name = config.get("timestamp_column_name", "timestamp") client = MlflowClient() - model_manager = ModelManager(model_name_formatter=model_name_formatter) + + model_manager = None def get_model(user: str) -> ModelCache: + nonlocal model_manager + + if not model_manager: + model_manager = ModelManager(model_name_formatter=model_name_formatter) - return model_manager.load_user_model(client, user_id=user, fallback_user_ids=[fallback_user]) + return model_manager.load_user_model(client, + user_id=user, + fallback_user_ids=[fallback_user], + timeout=model_fetch_timeout) def process_task(control_message: ControlMessage): start_time = time.time() diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_monitor.py b/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_monitor.py index 5f70a92695..7706af78c3 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_monitor.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_monitor.py @@ -21,9 +21,9 @@ from mrc.core import operators as ops from tqdm import tqdm +from morpheus.controllers.monitor_controller import MonitorController from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_utils import register_module -from morpheus.utils.monitor_utils import MonitorController from morpheus.utils.monitor_utils import MorpheusTqdm from morpheus.utils.monitor_utils import SilentMorpheusTqdm diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_training.py b/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_training.py index ec8ff30db5..aec5f9a2dc 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_training.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/modules/dfp_training.py @@ -16,6 +16,7 @@ import mrc from mrc.core import operators as ops +from sklearn.model_selection import train_test_split import cudf @@ -87,8 +88,16 @@ def on_data(control_message: ControlMessage): # Only train on the feature columns train_df = final_df[final_df.columns.intersection(feature_columns)] + validation_df = None + 
run_validation = False + + # Split into training and validation sets + if validation_size > 0.0: + train_df, validation_df = train_test_split(train_df, test_size=validation_size, shuffle=False) + run_validation = True + logger.debug("Training AE model for user: '%s'...", user_id) - model.fit(train_df, epochs=epochs) + model.fit(train_df, epochs=epochs, val_data=validation_df, run_validation=run_validation) logger.debug("Training AE model for user: '%s'... Complete.", user_id) dfp_mm = DFPMessageMeta(cudf.from_pandas(final_df), user_id=user_id) diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py index 271acc4833..7a9eee94af 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py @@ -46,7 +46,7 @@ class DFPFileBatcherStage(SinglePortStage): Parameters ---------- - c : `morpheus.config.Config` + config : `morpheus.config.Config` Pipeline configuration instance. date_conversion_func : callable A function that takes a file object and returns a `datetime` object representing the date of the file. @@ -69,14 +69,14 @@ class DFPFileBatcherStage(SinglePortStage): """ def __init__(self, - c: Config, + config: Config, date_conversion_func: typing.Callable[[fsspec.core.OpenFile], datetime], period: str = "D", sampling_rate_s: typing.Optional[int] = None, start_time: datetime = None, end_time: datetime = None, sampling: typing.Union[str, float, int, None] = None): - super().__init__(c) + super().__init__(config) self._date_conversion_func = date_conversion_func self._period = period diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py index ec0ac35a09..a8c37ae9b6 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py @@ -13,62 +13,24 @@ # limitations under the License. 
"""Stage for converting fsspec file objects to a DataFrame.""" -import hashlib -import json import logging -import os -import time import typing -from functools import partial -import fsspec import mrc import pandas as pd from mrc.core import operators as ops from morpheus.common import FileTypes from morpheus.config import Config -from morpheus.io.deserializers import read_file_to_df +from morpheus.controllers.file_to_df_controller import FileToDFController from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stream_pair import StreamPair from morpheus.utils.column_info import DataFrameInputSchema -from morpheus.utils.column_info import process_dataframe -from morpheus.utils.downloader import Downloader logger = logging.getLogger(f"morpheus.{__name__}") -def _single_object_to_dataframe(file_object: fsspec.core.OpenFile, - schema: DataFrameInputSchema, - file_type: FileTypes, - filter_null: bool, - parser_kwargs: dict) -> pd.DataFrame: - retries = 0 - df = None - while (retries < 2): - try: - with file_object as f: - df = read_file_to_df(f, - file_type, - filter_nulls=filter_null, - df_type="pandas", - parser_kwargs=parser_kwargs) - - break - except Exception as e: - if (retries < 2): - logger.warning("Error fetching %s: %s\nRetrying...", file_object, e) - retries += 1 - - # Optimistaclly prep the dataframe (Not necessary since this will happen again in process_dataframe, but it - # increases performance significantly) - if (schema.prep_dataframe is not None): - df = schema.prep_dataframe(df) - - return df - - class DFPFileToDataFrameStage(PreallocatorMixin, SinglePortStage): """ Stage for converting fsspec file objects to a DataFrame, pre-processing the DataFrame according to `schema`, and @@ -102,14 +64,12 @@ def __init__(self, cache_dir: str = "./.cache/dfp"): super().__init__(config) - self._schema = schema - - self._file_type = file_type - self._filter_null = filter_null - self._parser_kwargs = {} if parser_kwargs is None else parser_kwargs - self._cache_dir = os.path.join(cache_dir, "file_cache") - - self._downloader = Downloader() + self._controller = FileToDFController(schema=schema, + filter_null=filter_null, + file_type=file_type, + parser_kwargs=parser_kwargs, + cache_dir=cache_dir, + timestamp_column_name=config.ae.timestamp_column_name) @property def name(self) -> str: @@ -124,103 +84,10 @@ def accepted_types(self) -> typing.Tuple: """Accepted input types.""" return (typing.Any, ) - def _get_or_create_dataframe_from_batch( - self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[pd.DataFrame, bool]: - - if (not file_object_batch): - raise RuntimeError("No file objects to process") - - file_list = file_object_batch[0] - batch_count = file_object_batch[1] - - file_system: fsspec.AbstractFileSystem = file_list.fs - - # Create a list of dictionaries that only contains the information we are interested in hashing. 
`ukey` just - # hashes all the output of `info()` which is perfect - hash_data = [{"ukey": file_system.ukey(file_object.path)} for file_object in file_list] - - # Convert to base 64 encoding to remove - values - objects_hash_hex = hashlib.md5(json.dumps(hash_data, sort_keys=True).encode()).hexdigest() - - batch_cache_location = os.path.join(self._cache_dir, "batches", f"{objects_hash_hex}.pkl") - - # Return the cache if it exists - if (os.path.exists(batch_cache_location)): - output_df = pd.read_pickle(batch_cache_location) - output_df["batch_count"] = batch_count - output_df["origin_hash"] = objects_hash_hex - - return (output_df, True) - - # Cache miss - download_method = partial(_single_object_to_dataframe, - schema=self._schema, - file_type=self._file_type, - filter_null=self._filter_null, - parser_kwargs=self._parser_kwargs) - - download_buckets = file_list - - # Loop over dataframes and concat into one - try: - dfs = self._downloader.download(download_buckets, download_method) - except Exception: - logger.exception("Failed to download logs. Error: ", exc_info=True) - raise - - if (dfs is None or len(dfs) == 0): - raise ValueError("No logs were downloaded") - - output_df: pd.DataFrame = pd.concat(dfs) - output_df = process_dataframe(df_in=output_df, input_schema=self._schema) - - # Finally sort by timestamp and then reset the index - output_df.sort_values(by=[self._config.ae.timestamp_column_name], inplace=True) - - output_df.reset_index(drop=True, inplace=True) - - # Save dataframe to cache future runs - os.makedirs(os.path.dirname(batch_cache_location), exist_ok=True) - - try: - output_df.to_pickle(batch_cache_location) - except Exception: - logger.warning("Failed to save batch cache. Skipping cache for this batch.", exc_info=True) - - output_df["batch_count"] = batch_count - output_df["origin_hash"] = objects_hash_hex - - return (output_df, False) - - def convert_to_dataframe(self, fsspec_batch: typing.Tuple[fsspec.core.OpenFiles, int]): - """Converts a batch of fsspec objects to a DataFrame.""" - if (not fsspec_batch): - return None - - start_time = time.time() - - try: - - output_df, cache_hit = self._get_or_create_dataframe_from_batch(fsspec_batch) - - duration = (time.time() - start_time) * 1000.0 - - if (output_df is not None and logger.isEnabledFor(logging.DEBUG)): - logger.debug("fsspec objects to DF complete. Rows: %s, Cache: %s, Duration: %s ms, Rate: %s rows/s", - len(output_df), - "hit" if cache_hit else "miss", - duration, - len(output_df) / (duration / 1000.0)) - - return output_df - except Exception: - logger.exception("Error while converting fsspec batch to DF.") - raise - def _build_single(self, builder: mrc.Builder, input_stream: StreamPair) -> StreamPair: stream = builder.make_node(self.unique_name, - ops.map(self.convert_to_dataframe), - ops.on_completed(self._downloader.close)) + ops.map(self._controller.convert_to_dataframe), + ops.on_completed(self._controller.close)) builder.make_edge(input_stream[0], stream) return stream, pd.DataFrame diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py index 240a329065..3daba9b6c2 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py @@ -13,35 +13,18 @@ # limitations under the License. 
"""Publishes models into MLflow""" -import hashlib import logging -import os import typing -import urllib.parse -import mlflow import mrc -import requests -from mlflow.exceptions import MlflowException -from mlflow.models.signature import ModelSignature -from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS -from mlflow.protos.databricks_pb2 import ErrorCode -from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository -from mlflow.tracking import MlflowClient -from mlflow.types import ColSpec -from mlflow.types import Schema -from mlflow.types.utils import _infer_pandas_column -from mlflow.types.utils import _infer_schema from mrc.core import operators as ops from morpheus.config import Config +from morpheus.controllers.mlflow_model_writer_controller import MLFlowModelWriterController from morpheus.messages.multi_ae_message import MultiAEMessage -from morpheus.models.dfencoder import AutoEncoder from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stream_pair import StreamPair -from ..utils.model_cache import user_to_model_name - # Setup conda environment conda_env = { 'channels': ['defaults', 'conda-forge'], @@ -70,18 +53,24 @@ class DFPMLFlowModelWriterStage(SinglePortStage): the field names have been applied. databricks_permissions : dict, optional When not `None` sets permissions needed when using a databricks hosted MLflow server. + timeout : float, optional + Timeout for get requests. """ def __init__(self, c: Config, model_name_formatter: str = "dfp-{user_id}", experiment_name_formatter: str = "/dfp-models/{reg_model_name}", - databricks_permissions: dict = None): + databricks_permissions: dict = None, + timeout=1.0): super().__init__(c) - self._model_name_formatter = model_name_formatter - self._experiment_name_formatter = experiment_name_formatter - self._databricks_permissions = databricks_permissions + self._controller = MLFlowModelWriterController(model_name_formatter=model_name_formatter, + experiment_name_formatter=experiment_name_formatter, + databricks_permissions=databricks_permissions, + conda_env=conda_env, + timeout=timeout, + timestamp_column_name=c.ae.timestamp_column_name) @property def name(self) -> str: @@ -96,178 +85,8 @@ def accepted_types(self) -> typing.Tuple: """Types accepted by this stage""" return (MultiAEMessage, ) - def user_id_to_model(self, user_id: str) -> str: - """Converts a user ID to a model name""" - return user_to_model_name(user_id=user_id, model_name_formatter=self._model_name_formatter) - - def user_id_to_experiment(self, user_id: str) -> str: - """Converts a user ID to an experiment name""" - kwargs = { - "user_id": user_id, - "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), - "reg_model_name": self.user_id_to_model(user_id=user_id) - } - - return self._experiment_name_formatter.format(**kwargs) - - def _apply_model_permissions(self, reg_model_name: str): - - # Check the required variables - databricks_host = os.environ.get("DATABRICKS_HOST", None) - databricks_token = os.environ.get("DATABRICKS_TOKEN", None) - - if (databricks_host is None or databricks_token is None): - raise RuntimeError("Cannot set Databricks model permissions. 
" - "Environment variables `DATABRICKS_HOST` and `DATABRICKS_TOKEN` must be set") - - headers = {"Authorization": f"Bearer {databricks_token}"} - - url_base = f"{databricks_host}" - - try: - # First get the registered model ID - get_registered_model_url = urllib.parse.urljoin(url_base, - "/api/2.0/mlflow/databricks/registered-models/get") - - get_registered_model_response = requests.get(url=get_registered_model_url, - headers=headers, - params={"name": reg_model_name}, - timeout=10) - - registered_model_response = get_registered_model_response.json() - - reg_model_id = registered_model_response["registered_model_databricks"]["id"] - - # Now apply the permissions. If it exists already, it will be overwritten or it is a no-op - patch_registered_model_permissions_url = urllib.parse.urljoin( - url_base, f"/api/2.0/preview/permissions/registered-models/{reg_model_id}") - - patch_registered_model_permissions_body = { - "access_control_list": [{ - "group_name": group, "permission_level": permission - } for group, - permission in self._databricks_permissions.items()] - } - - requests.patch(url=patch_registered_model_permissions_url, - headers=headers, - json=patch_registered_model_permissions_body, - timeout=10) - - except Exception: - logger.exception("Error occurred trying to apply model permissions to model: %s", - reg_model_name, - exc_info=True) - - def on_data(self, message: MultiAEMessage): - """Stores incoming models into MLflow.""" - user = message.meta.user_id - - model: AutoEncoder = message.model - - model_path = "dfencoder" - reg_model_name = self.user_id_to_model(user_id=user) - - # Write to ML Flow - try: - mlflow.end_run() - - experiment_name = self.user_id_to_experiment(user_id=user) - - # Creates a new experiment if it doesn't exist - experiment = mlflow.set_experiment(experiment_name) - - with mlflow.start_run(run_name="autoencoder model training run", - experiment_id=experiment.experiment_id) as run: - - model_path = f"{model_path}-{run.info.run_uuid}" - - # Log all params in one dict to avoid round trips - mlflow.log_params({ - "Algorithm": "Denosing Autoencoder", - "Epochs": model.lr_decay.state_dict().get("last_epoch", "unknown"), - "Learning rate": model.lr, - "Batch size": model.batch_size, - "Start Epoch": message.get_meta(self._config.ae.timestamp_column_name).min(), - "End Epoch": message.get_meta(self._config.ae.timestamp_column_name).max(), - "Log Count": message.mess_count, - }) - - metrics_dict: typing.Dict[str, float] = {} - - # Add info on the embeddings - for key, value in model.categorical_fts.items(): - embedding = value.get("embedding", None) - - if (embedding is None): - continue - - metrics_dict[f"embedding-{key}-num_embeddings"] = embedding.num_embeddings - metrics_dict[f"embedding-{key}-embedding_dim"] = embedding.embedding_dim - - mlflow.log_metrics(metrics_dict) - - # Use the prepare_df function to setup the direct inputs to the model. 
Only include features returned by - # prepare_df to show the actual inputs to the model (any extra are discarded) - input_df = message.get_meta().iloc[0:1] - prepared_df = model.prepare_df(input_df) - output_values = model.get_anomaly_score(input_df) - - input_schema = Schema([ - ColSpec(type=_infer_pandas_column(input_df[col_name]), name=col_name) - for col_name in list(prepared_df.columns) - ]) - output_schema = _infer_schema(output_values) - - model_sig = ModelSignature(inputs=input_schema, outputs=output_schema) - - model_info = mlflow.pytorch.log_model( - pytorch_model=model, - artifact_path=model_path, - conda_env=conda_env, - signature=model_sig, - ) - - client = MlflowClient() - - # First ensure a registered model has been created - try: - create_model_response = client.create_registered_model(reg_model_name) - logger.debug("Successfully registered model '%s'.", create_model_response.name) - except MlflowException as e: - if e.error_code == ErrorCode.Name(RESOURCE_ALREADY_EXISTS): - pass - else: - raise e - - # If we are using databricks, make sure we set the correct permissions - if (self._databricks_permissions is not None and mlflow.get_tracking_uri() == "databricks"): - # Need to apply permissions - self._apply_model_permissions(reg_model_name=reg_model_name) - - model_src = RunsArtifactRepository.get_underlying_uri(model_info.model_uri) - - tags = { - "start": message.get_meta(self._config.ae.timestamp_column_name).min(), - "end": message.get_meta(self._config.ae.timestamp_column_name).max(), - "count": message.get_meta(self._config.ae.timestamp_column_name).count() - } - - # Now create the model version - model_version = client.create_model_version(name=reg_model_name, - source=model_src, - run_id=run.info.run_id, - tags=tags) - - logger.debug("ML Flow model upload complete: %s:%s:%s", user, reg_model_name, model_version.version) - - except Exception: - logger.exception("Error uploading model to ML Flow", exc_info=True) - - return message - def _build_single(self, builder: mrc.Builder, input_stream: StreamPair) -> StreamPair: - stream = builder.make_node(self.unique_name, ops.map(self.on_data)) + stream = builder.make_node(self.unique_name, ops.map(self._controller.on_data)) builder.make_edge(input_stream[0], stream) return stream, MultiAEMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/dfp_arg_parser.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/dfp_arg_parser.py index 6bf71a0a3d..4b807443ad 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/utils/dfp_arg_parser.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/dfp_arg_parser.py @@ -24,7 +24,7 @@ from morpheus.utils.logger import configure_logging -logger = logging.getLogger(__name__) +logger = logging.getLogger(f"morpheus.{__name__}") @dataclass @@ -95,6 +95,7 @@ def time_fields(self): def silence_monitors(self): return self._silence_monitors + @property @verify_init def include_generic(self): return self._include_generic diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py index 2a0da79752..ffc5304e5b 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py @@ -131,13 +131,13 @@ def __init__(self, manager: "ModelManager", user_id: str, fallback_user_ids: typ self._lock = threading.RLock() 
self._child_user_model_cache: UserModelMap = None - def load_model_cache(self, client) -> ModelCache: + def load_model_cache(self, client, timeout: float = 1.0) -> ModelCache: now = datetime.now() # Lock to prevent additional access try: - with timed_acquire(self._lock, timeout=1.0): + with timed_acquire(self._lock, timeout=timeout): # Check if we have checked before or if we need to check again if (self._last_checked is None or (now - self._last_checked).seconds < self._manager.cache_timeout_sec): @@ -146,22 +146,26 @@ def load_model_cache(self, client) -> ModelCache: self._last_checked = now # Try to load from the manager - model_cache = self._manager.load_model_cache(client=client, reg_model_name=self._reg_model_name) + model_cache = self._manager.load_model_cache(client=client, + reg_model_name=self._reg_model_name, + timeout=timeout) # If we have a hit, there is nothing else to do if (model_cache is None and len(self._fallback_user_ids) > 0): # Our model does not exist, use fallback self._child_user_model_cache = self._manager.load_user_model_cache( - self._fallback_user_ids[0], fallback_user_ids=self._fallback_user_ids[1:]) + self._fallback_user_ids[0], timeout, fallback_user_ids=self._fallback_user_ids[1:]) else: return model_cache # See if we have a child cache and use that if (self._child_user_model_cache is not None): - return self._child_user_model_cache.load_model_cache(client=client) + return self._child_user_model_cache.load_model_cache(client=client, timeout=timeout) # Otherwise load the model - model_cache = self._manager.load_model_cache(client=client, reg_model_name=self._reg_model_name) + model_cache = self._manager.load_model_cache(client=client, + reg_model_name=self._reg_model_name, + timeout=timeout) if (model_cache is None): raise RuntimeError(f"Model was found but now no longer exists. 
Model: {self._reg_model_name}") @@ -197,7 +201,7 @@ def __init__(self, model_name_formatter: str) -> None: def cache_timeout_sec(self): return self._cache_timeout_sec - def _model_exists(self, reg_model_name: str) -> bool: + def _model_exists(self, reg_model_name: str, timeout: float = 1.0) -> bool: now = datetime.now() @@ -205,7 +209,7 @@ def _model_exists(self, reg_model_name: str) -> bool: if ((now - self._existing_models_updated).seconds > self._cache_timeout_sec): try: - with timed_acquire(self._model_cache_lock, timeout=1.0): + with timed_acquire(self._model_cache_lock, timeout=timeout): logger.debug("Updating list of available models...") client = MlflowClient() @@ -241,22 +245,28 @@ def _model_exists(self, reg_model_name: str) -> bool: def user_id_to_model(self, user_id: str): return user_to_model_name(user_id=user_id, model_name_formatter=self._model_name_formatter) - def load_user_model(self, client, user_id: str, fallback_user_ids: typing.List[str] = None) -> ModelCache: + def load_user_model(self, + client, + user_id: str, + fallback_user_ids: typing.List[str], + timeout: float = 1.0) -> ModelCache: - if (fallback_user_ids is None): + if fallback_user_ids is None: fallback_user_ids = [] # First get the UserModel - user_model_cache = self.load_user_model_cache(user_id=user_id, fallback_user_ids=fallback_user_ids) + user_model_cache = self.load_user_model_cache(user_id=user_id, + timeout=timeout, + fallback_user_ids=fallback_user_ids) - return user_model_cache.load_model_cache(client=client) + return user_model_cache.load_model_cache(client=client, timeout=timeout) - def load_model_cache(self, client: MlflowClient, reg_model_name: str) -> ModelCache: + def load_model_cache(self, client: MlflowClient, reg_model_name: str, timeout: float = 1.0) -> ModelCache: now = datetime.now() try: - with timed_acquire(self._model_cache_lock, timeout=1.0): + with timed_acquire(self._model_cache_lock, timeout=timeout): model_cache = self._model_cache.get(reg_model_name, None) @@ -267,7 +277,7 @@ def load_model_cache(self, client: MlflowClient, reg_model_name: str) -> ModelCa # Cache miss. 
Try to check for a model try: - if (not self._model_exists(reg_model_name)): + if (not self._model_exists(reg_model_name, timeout)): # Break early return None @@ -323,12 +333,10 @@ def load_model_cache(self, client: MlflowClient, reg_model_name: str) -> ModelCa logger.error("Deadlock when trying to acquire model cache lock") raise RuntimeError("Deadlock when trying to acquire model cache lock") from e - def load_user_model_cache(self, user_id: str, fallback_user_ids: typing.List[str] = None) -> UserModelMap: - if (fallback_user_ids is None): - fallback_user_ids = [] + def load_user_model_cache(self, user_id: str, timeout: float, fallback_user_ids: typing.List[str]) -> UserModelMap: try: - with timed_acquire(self._user_model_cache_lock, timeout=1.0): + with timed_acquire(self._user_model_cache_lock, timeout=timeout): if (user_id not in self._user_model_cache): self._user_model_cache[user_id] = UserModelMap(manager=self, diff --git a/examples/ransomware_detection/common/feature_extractor.py b/examples/ransomware_detection/common/feature_extractor.py index d8b579d128..46df5c9181 100644 --- a/examples/ransomware_detection/common/feature_extractor.py +++ b/examples/ransomware_detection/common/feature_extractor.py @@ -15,6 +15,7 @@ import typing import pandas as pd + from common.data_models import FeatureConfig from common.data_models import ProtectionData from common.feature_constants import FeatureConstants as fc @@ -110,59 +111,59 @@ def _extract_threadlist(self, x: pd.DataFrame): wait_reason_df = x[x.WaitReason == wait_reason] self._features['threadlist_df_wait_reason_' + wait_reason] = len(wait_reason_df) - def _extract_vad_cc(self, cc: pd.Series): + def _extract_vad_cc(self, commit_charge: pd.Series): """ This function extracts 'vad' specific commit charge features. """ - cc_size = len(cc) + cc_size = len(commit_charge) # Calculate mean, max, sum of commit charged of vad if cc_size: - self._features['get_commit_charge_mean_vad'] = cc.mean() - self._features['get_commit_charge_max_vad'] = cc.max() - self._features['get_commit_charge_sum_vad'] = cc.sum() + self._features['get_commit_charge_mean_vad'] = commit_charge.mean() + self._features['get_commit_charge_max_vad'] = commit_charge.max() + self._features['get_commit_charge_sum_vad'] = commit_charge.sum() - def _extract_cc(self, cc: pd.Series): + def _extract_cc(self, commit_charge: pd.Series): """ This function extracts commit charge features. """ - cc_size = len(cc) + cc_size = len(commit_charge) # Calculate mean, max, sum, len of the commit charged if cc_size: - self._features['get_commit_charge_mean'] = cc.mean() - self._features['get_commit_charge_max'] = cc.max() - self._features['get_commit_charge_sum'] = cc.sum() + self._features['get_commit_charge_mean'] = commit_charge.mean() + self._features['get_commit_charge_max'] = commit_charge.max() + self._features['get_commit_charge_sum'] = commit_charge.sum() self._features['get_commit_charge_len'] = cc_size - def _extract_vads_cc(self, cc: pd.Series, vads_cc: pd.Series): + def _extract_vads_cc(self, commit_charge: pd.Series, vads_cc: pd.Series): """ This function extracts 'vads' commit charge features. 
""" - cc_size = len(cc) + cc_size = len(commit_charge) # Calculate min of commit charged of vads if cc_size: - self._features['get_commit_charge_min_vads'] = cc.min() + self._features['get_commit_charge_min_vads'] = commit_charge.min() # Calculate the amount of entire memory commit charged of vads - cc = vads_cc[vads_cc == fc.FULL_MEMORY_ADDRESS] - self._features['count_entire_commit_charge_vads'] = len(cc) + commit_charge = vads_cc[vads_cc == fc.FULL_MEMORY_ADDRESS] + self._features['count_entire_commit_charge_vads'] = len(commit_charge) - def _extract_cc_vad_page_noaccess(self, cc: pd.Series): + def _extract_cc_vad_page_noaccess(self, commit_charge: pd.Series): """ This function extracts 'vad' commit charge features specific to 'page_noaccess' protection. """ - cc = cc[cc < fc.FULL_MEMORY_ADDRESS] + commit_charge = commit_charge[commit_charge < fc.FULL_MEMORY_ADDRESS] # Calculate min and mean of commit charged of vad memory with PAGE_NOACCESS protection - if not cc.empty: - self._features['get_commit_charge_min_vad_page_noaccess'] = cc.min() - self._features['get_commit_charge_mean_vad_page_noaccess'] = cc.mean() + if not commit_charge.empty: + self._features['get_commit_charge_min_vad_page_noaccess'] = commit_charge.min() + self._features['get_commit_charge_mean_vad_page_noaccess'] = commit_charge.mean() def _extract_unique_file_extns(self, x: pd.DataFrame): """ @@ -210,20 +211,20 @@ def _extract_vadinfo(self, x: pd.DataFrame): self._features['ratio_private_memory'] = (vad_private_memory_len / vad_size) self._features['vad_ratio'] = (vadinfo_size / vad_size) - cc = x[x.CommitCharge < fc.FULL_MEMORY_ADDRESS].CommitCharge - self._extract_cc(cc) + commit_charge = x[x.CommitCharge < fc.FULL_MEMORY_ADDRESS].CommitCharge + self._extract_cc(commit_charge) # calculating the amount of commit charged of vad - cc = vad_cc[vad_cc < fc.FULL_MEMORY_ADDRESS] - self._extract_vad_cc(cc) + commit_charge = vad_cc[vad_cc < fc.FULL_MEMORY_ADDRESS] + self._extract_vad_cc(commit_charge) # Calculate the amount of commit charged of vads - cc = vads_cc[vads_cc < fc.FULL_MEMORY_ADDRESS] - self._extract_vads_cc(cc, vads_cc) + commit_charge = vads_cc[vads_cc < fc.FULL_MEMORY_ADDRESS] + self._extract_vads_cc(commit_charge, vads_cc) # calculating commit charged of memory with PAGE_NOACCESS protection - cc = x[(x.Protection == fc.PAGE_NOACCESS) & (x.Tag == fc.VAD)].CommitCharge - self._extract_cc_vad_page_noaccess(cc) + commit_charge = x[(x.Protection == fc.PAGE_NOACCESS) & (x.Tag == fc.VAD)].CommitCharge + self._extract_cc_vad_page_noaccess(commit_charge) self._extract_protections(x, vad_size, vadsinfo_size, vadinfo_size) @@ -240,15 +241,15 @@ def _get_protection_data(self, """ protection_df = x[x.Protection == protection] - cc = protection_df.CommitCharge - cc = cc[cc < fc.FULL_MEMORY_ADDRESS] + commit_charge = protection_df.CommitCharge + commit_charge = commit_charge[commit_charge < fc.FULL_MEMORY_ADDRESS] vads_protection_size = len(protection_df[protection_df.Tag == fc.VADS]) vad_protection_size = len(protection_df[protection_df.Tag == fc.VAD]) - commit_charge_size = len(cc) + commit_charge_size = len(commit_charge) protection_df_size = len(protection_df) protection_id = fc.PROTECTIONS[protection] - p_data = ProtectionData(cc, + p_data = ProtectionData(commit_charge, vads_protection_size, vad_protection_size, commit_charge_size, @@ -265,14 +266,14 @@ def _page_execute_readwrite(self, x: ProtectionData): This function extracts 'page_execute_readwrite' protection reelated features. 
""" - cc = x.commit_charges + commit_charge = x.commit_charges if x.commit_charge_size: - self._features['get_commit_charge_mean_page_execute_readwrite'] = cc.mean() - self._features['get_commit_charge_min_page_execute_readwrite'] = cc.min() - self._features['get_commit_charge_max_page_execute_readwrite'] = cc.max() - self._features['get_commit_charge_sum_page_execute_readwrite'] = cc.sum() - self._features['get_commit_charge_std_page_execute_readwrite'] = cc.std(ddof=0) + self._features['get_commit_charge_mean_page_execute_readwrite'] = commit_charge.mean() + self._features['get_commit_charge_min_page_execute_readwrite'] = commit_charge.min() + self._features['get_commit_charge_max_page_execute_readwrite'] = commit_charge.max() + self._features['get_commit_charge_sum_page_execute_readwrite'] = commit_charge.sum() + self._features['get_commit_charge_std_page_execute_readwrite'] = commit_charge.std(ddof=0) # Calculate amount and ratio of memory pages with 'PAGE_EXECUTE_READWRITE protection if x.protection_df_size: @@ -289,13 +290,13 @@ def _page_noaccess(self, x: ProtectionData): This function extracts 'page_noaccess' protection reelated features. """ - cc = x.commit_charges + commit_charge = x.commit_charges if x.commit_charge_size: - self._features['get_commit_charge_mean_page_no_access'] = cc.mean() - self._features['get_commit_charge_min_page_no_access'] = cc.min() - self._features['get_commit_charge_max_page_no_access'] = cc.max() - self._features['get_commit_charge_sum_page_no_access'] = cc.sum() + self._features['get_commit_charge_mean_page_no_access'] = commit_charge.mean() + self._features['get_commit_charge_min_page_no_access'] = commit_charge.min() + self._features['get_commit_charge_max_page_no_access'] = commit_charge.max() + self._features['get_commit_charge_sum_page_no_access'] = commit_charge.sum() # Calculate amount and ratio of memory pages with 'PAGE_NOACCESS' protection if x.protection_df_size: @@ -317,12 +318,12 @@ def _page_execute_writecopy(self, x: ProtectionData): This function extracts 'page_execute_writecopy' protection reelated features. """ - cc = x.commit_charges + commit_charge = x.commit_charges # Calculate min and sum of commit charged with memory pages with 'PAGE_EXECUTE_WRITECOPY' protection if x.commit_charge_size: - self._features['get_commit_charge_min_page_execute_writecopy'] = cc.min() - self._features['get_commit_charge_sum_page_execute_writecopy'] = cc.sum() + self._features['get_commit_charge_min_page_execute_writecopy'] = commit_charge.min() + self._features['get_commit_charge_sum_page_execute_writecopy'] = commit_charge.sum() # Calculate amount and ratio of vad memory pages with 'PAGE_EXECUTE_WRITECOPY' protection self._features['page_execute_writecopy_vad_count'] = x.vad_protection_size @@ -334,11 +335,11 @@ def _page_readonly(self, x: ProtectionData): This function extracts 'page_readonly' protection reelated features. 
""" - cc = x.commit_charges + commit_charge = x.commit_charges # Calculate mean of commit charged with memory pages with 'PAGE_READONLY' protection if x.commit_charge_size: - self._features['get_commit_charge_mean_page_readonly'] = cc.mean() + self._features['get_commit_charge_mean_page_readonly'] = commit_charge.mean() # Calculate amount and ratio of memory pages with 'PAGE_READONLY' protection if x.protection_df_size: @@ -380,7 +381,7 @@ def _extract_protections(self, x: pd.DataFrame, vadinfo_df_size: int, vadsinfo_s """ page_execute_writecopy_count = 0 - for protection in fc.PROTECTIONS.keys(): + for protection in fc.PROTECTIONS: p_data = self._get_protection_data(x, protection, vadinfo_df_size, vadsinfo_size, vadinfo_size) @@ -422,16 +423,16 @@ def _extract_handle_types(self, x: pd.DataFrame): """ # Get count and ratio for the handles by their type. - for t in (fc.HANDLES_TYPES + fc.HANDLES_TYPES_2): + for h_type in (fc.HANDLES_TYPES + fc.HANDLES_TYPES_2): - df = x[x.Type == t[0]] + df = x[x.Type == h_type[0]] df_len = len(df) - if t in fc.HANDLES_TYPES: - col = 'handles_df_' + t[1] + '_count' + if h_type in fc.HANDLES_TYPES: + col = 'handles_df_' + h_type[1] + '_count' self._features[col] = df_len - col = 'handles_df_' + t[1] + '_ratio' + col = 'handles_df_' + h_type[1] + '_ratio' self._features[col] = df_len / (self._features['handles_df_count'] + 1) def _extract_file_handle_dirs(self, file_paths: pd.Series): @@ -559,7 +560,7 @@ def extract_features(self, x: pd.DataFrame, feas_all_zeros: typing.Dict[str, int handles_df = fltr_plugin_dict['handles'] except KeyError as e: - raise KeyError('Missing required plugins: %s' % (e)) + raise KeyError(f'Missing required plugins: {e}') from e # Envars plugin features displays a process's environment variables. # Typically this will show the number of CPUs installed and the hardware architecture, diff --git a/morpheus/controllers/file_to_df_controller.py b/morpheus/controllers/file_to_df_controller.py new file mode 100644 index 0000000000..2839f4e3c2 --- /dev/null +++ b/morpheus/controllers/file_to_df_controller.py @@ -0,0 +1,237 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Morpheus pipeline module for fetching files and emitting them as DataFrames.""" + +import hashlib +import json +import logging +import os +import time +import typing +from functools import partial + +import fsspec +import pandas as pd + +import cudf + +from morpheus.common import FileTypes +from morpheus.io.deserializers import read_file_to_df +from morpheus.utils.column_info import DataFrameInputSchema +from morpheus.utils.column_info import PreparedDFInfo +from morpheus.utils.column_info import process_dataframe +from morpheus.utils.downloader import Downloader + +logger = logging.getLogger(__name__) + + +def single_object_to_dataframe(file_object: fsspec.core.OpenFile, + schema: DataFrameInputSchema, + file_type: FileTypes, + filter_null: bool, + parser_kwargs: dict) -> pd.DataFrame: + """ + Converts a file object into a Pandas DataFrame with optional preprocessing. + + Parameters + ---------- + file_object : `fsspec.core.OpenFile` + A file object, typically from a remote storage system. + schema : `morpheus.utils.column_info.DataFrameInputSchema` + A schema defining how to process the data. + file_type : `morpheus.common.FileTypes` + The type of the file being processed (e.g., CSV, Parquet). + filter_null : bool + Flag to indicate whether to filter out null values. + parser_kwargs : dict + Additional keyword arguments to pass to the file parser. + + Returns + ------- + pd.DataFrame: The resulting Pandas DataFrame after processing and optional preprocessing. + """ + + retries = 0 + df = None + while (retries < 2): + try: + with file_object as f: + df = read_file_to_df(f, + file_type, + filter_nulls=filter_null, + df_type="pandas", + parser_kwargs=parser_kwargs) + + break + except Exception as e: + if (retries < 2): + logger.warning("Error fetching %s: %s\nRetrying...", file_object, e) + retries += 1 + + # Optimistaclly prep the dataframe (Not necessary since this will happen again in process_dataframe, but it + # increases performance significantly) + if (schema.prep_dataframe is not None): + prepared_df_info: PreparedDFInfo = schema.prep_dataframe(df) + + return prepared_df_info.df + + +class FileToDFController: + """ + Controller class for converting file objects to Pandas DataFrames with optional preprocessing. + + Parameters + ---------- + schema : DataFrameInputSchema + A schema defining how to process the data. + filter_null : bool + Flag to indicate whether to filter out null values. + file_type : FileTypes + The type of the file being processed (e.g., CSV, Parquet). + parser_kwargs : dict + Additional keyword arguments to pass to the file parser. + cache_dir : str + Directory where cache will be stored. + timestamp_column_name : str + Name of the timestamp column. 
+ """ + + def __init__(self, + schema: DataFrameInputSchema, + filter_null: bool, + file_type: FileTypes, + parser_kwargs: dict, + cache_dir: str, + timestamp_column_name: str): + + self._schema = schema + self._file_type = file_type + self._filter_null = filter_null + self._parser_kwargs = {} if parser_kwargs is None else parser_kwargs + self._cache_dir = os.path.join(cache_dir, "file_cache") + self._timestamp_column_name = timestamp_column_name + + self._downloader = Downloader() + + def _get_or_create_dataframe_from_batch( + self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[cudf.DataFrame, bool]: + + if (not file_object_batch): + raise RuntimeError("No file objects to process") + + file_list = file_object_batch[0] + batch_count = file_object_batch[1] + + file_system: fsspec.AbstractFileSystem = file_list.fs + + # Create a list of dictionaries that only contains the information we are interested in hashing. `ukey` just + # hashes all of the output of `info()` which is perfect + hash_data = [{"ukey": file_system.ukey(file_object.path)} for file_object in file_list] + + # Convert to base 64 encoding to remove - values + objects_hash_hex = hashlib.md5(json.dumps(hash_data, sort_keys=True).encode()).hexdigest() + + batch_cache_location = os.path.join(self._cache_dir, "batches", f"{objects_hash_hex}.pkl") + + # Return the cache if it exists + if (os.path.exists(batch_cache_location)): + output_df = pd.read_pickle(batch_cache_location) + output_df["batch_count"] = batch_count + output_df["origin_hash"] = objects_hash_hex + + return (output_df, True) + + # Cache miss + download_method_func = partial(single_object_to_dataframe, + file_type=self._file_type, + schema=self._schema, + filter_null=self._filter_null, + parser_kwargs=self._parser_kwargs) + + download_buckets = file_list + + # Loop over dataframes and concat into one + try: + dfs = self._downloader.download(download_buckets, download_method_func) + except Exception: + logger.exception("Failed to download logs. Error: ", exc_info=True) + raise + + if (dfs is None or len(dfs) == 0): + raise ValueError("No logs were downloaded") + + output_df: pd.DataFrame = pd.concat(dfs) + + output_df = process_dataframe(df_in=output_df, input_schema=self._schema) + + # Finally sort by timestamp and then reset the index + output_df.sort_values(by=[self._timestamp_column_name], inplace=True) + + output_df.reset_index(drop=True, inplace=True) + + # Save dataframe to cache future runs + os.makedirs(os.path.dirname(batch_cache_location), exist_ok=True) + + try: + output_df.to_pickle(batch_cache_location) + except Exception: + logger.warning("Failed to save batch cache. Skipping cache for this batch.", exc_info=True) + + output_df["batch_count"] = batch_count + output_df["origin_hash"] = objects_hash_hex + + return (output_df, False) + + def convert_to_dataframe(self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> pd.DataFrame: + """ + Convert a batch of file objects to a DataFrame. + + Parameters + ---------- + file_object_batch : typing.Tuple[fsspec.core.OpenFiles, int] + A batch of file objects and batch count. + + Returns + ------- + cudf.DataFrame + The resulting DataFrame. 
+ """ + + if (not file_object_batch): + return None + + start_time = time.time() + + try: + output_df, cache_hit = self._get_or_create_dataframe_from_batch(file_object_batch) + + duration = (time.time() - start_time) * 1000.0 + + if (output_df is not None and logger.isEnabledFor(logging.DEBUG)): + logger.debug("S3 objects to DF complete. Rows: %s, Cache: %s, Duration: %s ms, Rate: %s rows/s", + len(output_df), + "hit" if cache_hit else "miss", + duration, + len(output_df) / (duration / 1000.0)) + + return output_df + except Exception: + logger.exception("Error while converting S3 buckets to DF.") + raise + + def close(self): + """ + Close the resources used by the controller. + """ + self._downloader.close() diff --git a/morpheus/controllers/filter_detections_controller.py b/morpheus/controllers/filter_detections_controller.py new file mode 100644 index 0000000000..c346fab0ae --- /dev/null +++ b/morpheus/controllers/filter_detections_controller.py @@ -0,0 +1,165 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import typing + +import cupy as cp +import numpy as np +import typing_utils + +from morpheus.common import FilterSource +from morpheus.messages import MultiMessage +from morpheus.messages import MultiResponseMessage + +logger = logging.getLogger(__name__) + + +class FilterDetectionsController: + """ + Controller class for filtering detections based on a specified threshold and source. + + Parameters + ---------- + threshold : float + The threshold value for filtering detections. + filter_source : `morpheus.common.FilterSource` + The source used for filtering. + field_name : str + The name of the field used for filtering. + """ + + def __init__(self, threshold: float, filter_source: FilterSource, field_name: str) -> None: + self._threshold = threshold + self._filter_source = filter_source + self._field_name = field_name + + @property + def threshold(self): + """ + Get the threshold value. + """ + return self._threshold + + @property + def filter_source(self): + """ + Get the filter source. + """ + return self._filter_source + + @property + def field_name(self): + """ + Get the field name. 
+ """ + return self._field_name + + def _find_detections(self, x: MultiMessage) -> typing.Union[cp.ndarray, np.ndarray]: + # Determind the filter source + if self._filter_source == FilterSource.TENSOR: + filter_source = x.get_output(self._field_name) + else: + filter_source = x.get_meta(self._field_name).values + + if (isinstance(filter_source, np.ndarray)): + array_mod = np + else: + array_mod = cp + + # Get per row detections + detections = (filter_source > self._threshold) + + if (len(detections.shape) > 1): + detections = detections.any(axis=1) + + # Surround in False to ensure we get an even number of pairs + detections = array_mod.concatenate([array_mod.array([False]), detections, array_mod.array([False])]) + + return array_mod.where(detections[1:] != detections[:-1])[0].reshape((-1, 2)) + + def filter_copy(self, x: MultiMessage) -> MultiMessage: + """ + This function uses a threshold value to filter the messages. + + Parameters + ---------- + x : `morpheus.pipeline.messages.MultiMessage` + Response message with probabilities calculated from inference results. + + Returns + ------- + `morpheus.pipeline.messages.MultiMessage` + A new message containing a copy of the rows above the threshold. + + """ + if x is None: + return None + + true_pairs = self._find_detections(x) + + # If we didnt have any detections, return None + if (true_pairs.shape[0] == 0): + return None + + return x.copy_ranges(true_pairs) + + def filter_slice(self, x: MultiMessage) -> typing.List[MultiMessage]: + """ + This function uses a threshold value to filter the messages. + + Parameters + ---------- + x : `morpheus.pipeline.messages.MultiMessage` + Response message with probabilities calculated from inference results. + + Returns + ------- + typing.List[`morpheus.pipeline.messages.MultiMessage`] + List of filtered messages. + + """ + # Unfortunately we have to convert this to a list in case there are non-contiguous groups + output_list = [] + if x is not None: + true_pairs = self._find_detections(x) + for pair in true_pairs: + pair = tuple(pair.tolist()) + if ((pair[1] - pair[0]) > 0): + output_list.append(x.get_slice(*pair)) + + return output_list + + def update_filter_source(self, message_type: typing.Any): + """ + This function updates filter source. + + Parameters + ---------- + message_type : `typing.Any` + Response message with probabilities calculated from inference results. + """ + + # Unfortunately we have to convert this to a list in case there are non-contiguous groups + if self._filter_source == FilterSource.Auto: + if (typing_utils.issubtype(message_type, MultiResponseMessage)): + self._filter_source = FilterSource.TENSOR + else: + self._filter_source = FilterSource.DATAFRAME + + logger.debug( + "filter_source was set to Auto, inferring a filter source of %s based on an input " + "message type of %s", + self._filter_source, + message_type) diff --git a/morpheus/controllers/mlflow_model_writer_controller.py b/morpheus/controllers/mlflow_model_writer_controller.py new file mode 100644 index 0000000000..dca198ddcb --- /dev/null +++ b/morpheus/controllers/mlflow_model_writer_controller.py @@ -0,0 +1,305 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import logging +import os +import typing +import urllib.parse + +import mlflow +import requests +from mlflow.exceptions import MlflowException +from mlflow.models.signature import ModelSignature +from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS +from mlflow.protos.databricks_pb2 import ErrorCode +from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository +from mlflow.tracking import MlflowClient +from mlflow.types import ColSpec +from mlflow.types import Schema +from mlflow.types.utils import _infer_pandas_column +from mlflow.types.utils import _infer_schema + +import cudf + +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.models.dfencoder import AutoEncoder + +logger = logging.getLogger(__name__) + + +class MLFlowModelWriterController: + """ + Controller class for writing machine learning models to MLflow with optional permissions and configurations. + + Parameters + ---------- + model_name_formatter : str + Model name formatter. + experiment_name_formatter : str + Experiment name formatter. + databricks_permissions : dict + Users with read/write permissions. + conda_env : dict + Conda environment. + timeout : + Timeout for get requests. + timestamp_column_name : + Timestamp column name to be used from the dataframe. + + """ + + def __init__(self, + model_name_formatter, + experiment_name_formatter, + databricks_permissions, + conda_env, + timeout, + timestamp_column_name): + self._model_name_formatter = model_name_formatter + self._experiment_name_formatter = experiment_name_formatter + self._databricks_permissions = databricks_permissions + self._conda_env = conda_env + self._timeout = timeout + self._timestamp_column_name = timestamp_column_name + + @property + def model_name_formatter(self): + return self._model_name_formatter + + @property + def experiment_name_formatter(self): + return self._experiment_name_formatter + + @property + def databricks_permissions(self): + return self._databricks_permissions + + def user_id_to_model(self, user_id: str): + """ + Converts a user ID to an model name + + Parameters + ---------- + user_id : str + The user ID. + + Returns + ------- + str + The generated model name. + """ + + kwargs = { + "user_id": user_id, + "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), + } + + return self._model_name_formatter.format(**kwargs) + + def user_id_to_experiment(self, user_id: str) -> str: + """ + Converts a user ID to an experiment name + + Parameters + ---------- + user_id : str + The user ID. + + Returns + ------- + str + The generated experiment name. 
+ """ + + kwargs = { + "user_id": user_id, + "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), + "reg_model_name": self.user_id_to_model(user_id=user_id) + } + + return self._experiment_name_formatter.format(**kwargs) + + def _apply_model_permissions(self, reg_model_name: str): + + # Check the required variables + databricks_host = os.environ.get("DATABRICKS_HOST", None) + databricks_token = os.environ.get("DATABRICKS_TOKEN", None) + + if (databricks_host is None or databricks_token is None): + raise RuntimeError("Cannot set Databricks model permissions. " + "Environment variables `DATABRICKS_HOST` and `DATABRICKS_TOKEN` must be set") + + headers = {"Authorization": f"Bearer {databricks_token}"} + + url_base = f"{databricks_host}" + + try: + # First get the registered model ID + get_registered_model_url = urllib.parse.urljoin(url_base, + "/api/2.0/mlflow/databricks/registered-models/get") + + get_registered_model_response = requests.get(url=get_registered_model_url, + headers=headers, + params={"name": reg_model_name}, + timeout=self._timeout) + + registered_model_response = get_registered_model_response.json() + + reg_model_id = registered_model_response["registered_model_databricks"]["id"] + + # Now apply the permissions. If it exists already, it will be overwritten or it is a no-op + patch_registered_model_permissions_url = urllib.parse.urljoin( + url_base, f"/api/2.0/preview/permissions/registered-models/{reg_model_id}") + + patch_registered_model_permissions_body = { + "access_control_list": [{ + "group_name": group, "permission_level": permission + } for group, + permission in self._databricks_permissions.items()] + } + + requests.patch(url=patch_registered_model_permissions_url, + headers=headers, + json=patch_registered_model_permissions_body, + timeout=self._timeout) + + except Exception: + logger.exception("Error occurred trying to apply model permissions to model: %s", + reg_model_name, + exc_info=True) + + def on_data(self, message: MultiAEMessage): + """ + Stores incoming models into MLflow. + + Parameters + ---------- + message : MultiAEMessage + The incoming message containing the model and related metadata. + + Returns + ------- + MultiAEMessage + The processed message. 
+ """ + + user = message.meta.user_id + + model: AutoEncoder = message.model + + model_path = "dfencoder" + reg_model_name = self.user_id_to_model(user_id=user) + + # Write to ML Flow + try: + mlflow.end_run() + + experiment_name = self.user_id_to_experiment(user_id=user) + + # Creates a new experiment if it doesn't exist + experiment = mlflow.set_experiment(experiment_name) + + with mlflow.start_run(run_name="autoencoder model training run", + experiment_id=experiment.experiment_id) as run: + + model_path = f"{model_path}-{run.info.run_uuid}" + + # Log all params in one dict to avoid round trips + mlflow.log_params({ + "Algorithm": "Denosing Autoencoder", + "Epochs": model.lr_decay.state_dict().get("last_epoch", "unknown"), + "Learning rate": model.lr, + "Batch size": model.batch_size, + "Start Epoch": message.get_meta(self._timestamp_column_name).min(), + "End Epoch": message.get_meta(self._timestamp_column_name).max(), + "Log Count": message.mess_count, + }) + + metrics_dict: typing.Dict[str, float] = {} + + # Add info on the embeddings + for key, value in model.categorical_fts.items(): + embedding = value.get("embedding", None) + + if (embedding is None): + continue + + metrics_dict[f"embedding-{key}-num_embeddings"] = embedding.num_embeddings + metrics_dict[f"embedding-{key}-embedding_dim"] = embedding.embedding_dim + + mlflow.log_metrics(metrics_dict) + + # Use the prepare_df function to setup the direct inputs to the model. Only include features returned by + # prepare_df to show the actual inputs to the model (any extra are discarded) + input_df = message.get_meta().iloc[0:1] + + if isinstance(input_df, cudf.DataFrame): + input_df = input_df.to_pandas() + + prepared_df = model.prepare_df(input_df) + output_values = model.get_anomaly_score(input_df) + + input_schema = Schema([ + ColSpec(type=_infer_pandas_column(input_df[col_name]), name=col_name) + for col_name in list(prepared_df.columns) + ]) + output_schema = _infer_schema(output_values) + + model_sig = ModelSignature(inputs=input_schema, outputs=output_schema) + + model_info = mlflow.pytorch.log_model( + pytorch_model=model, + artifact_path=model_path, + conda_env=self._conda_env, + signature=model_sig, + ) + + client = MlflowClient() + + # First ensure a registered model has been created + try: + create_model_response = client.create_registered_model(reg_model_name) + logger.debug("Successfully registered model '%s'.", create_model_response.name) + except MlflowException as e: + if e.error_code == ErrorCode.Name(RESOURCE_ALREADY_EXISTS): + pass + else: + raise e + + # If we are using databricks, make sure we set the correct permissions + if (self._databricks_permissions is not None and mlflow.get_tracking_uri() == "databricks"): + # Need to apply permissions + self._apply_model_permissions(reg_model_name=reg_model_name) + + model_src = RunsArtifactRepository.get_underlying_uri(model_info.model_uri) + + tags = { + "start": message.get_meta(self._timestamp_column_name).min(), + "end": message.get_meta(self._timestamp_column_name).max(), + "count": message.get_meta(self._timestamp_column_name).count() + } + + # Now create the model version + mv_obj = client.create_model_version(name=reg_model_name, + source=model_src, + run_id=run.info.run_id, + tags=tags) + + logger.debug("ML Flow model upload complete: %s:%s:%s", user, reg_model_name, mv_obj.version) + + except Exception: + logger.exception("Error uploading model to ML Flow", exc_info=True) + + return message diff --git a/morpheus/controllers/monitor_controller.py 
b/morpheus/controllers/monitor_controller.py new file mode 100644 index 0000000000..30940caf7b --- /dev/null +++ b/morpheus/controllers/monitor_controller.py @@ -0,0 +1,235 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import typing +from functools import reduce + +import fsspec +from tqdm import tqdm + +import cudf + +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta +from morpheus.messages import MultiMessage +from morpheus.utils.logger import LogLevels +from morpheus.utils.monitor_utils import MorpheusTqdm + +logger = logging.getLogger(__name__) + + +class MonitorController: + """ + Controls and displays throughput numbers at a specific point in the pipeline. + + Parameters + ---------- + position: int + Specifies the monitor's position on the console. + description : str, default = "Progress" + Name to show for this Monitor Stage in the console window. + smoothing : float + Smoothing parameter to determine how much the throughput should be averaged. 0 = Instantaneous, 1 = + Average. + unit : str + Units to show in the rate value. + delayed_start : bool + When delayed_start is enabled, the progress bar will not be shown until the first message is received. + Otherwise, the progress bar is shown on pipeline startup and will begin timing immediately. In large pipelines, + this option may be desired to give a more accurate timing. + determine_count_fn : typing.Callable[[typing.Any], int] + Custom function for determining the count in a message. Gets called for each message. Allows for + correct counting of batched and sliced messages. + log_level : `morpheus.utils.logger.LogLevels`, default = 'INFO' + Enable this stage when the configured log level is at `log_level` or lower. + tqdm_class: `tqdm`, default = None + Custom implementation of tqdm if required. + """ + + controller_count: int = 0 + + def __init__(self, + position: int, + description: str, + smoothing: float, + unit: str, + delayed_start: bool, + determine_count_fn: typing.Callable[[typing.Any], int], + log_level: LogLevels, + tqdm_class: tqdm = None): + + self._progress: tqdm = None + self._position = position + self._description = description + self._smoothing = smoothing + self._unit = unit + self._delayed_start = delayed_start + self._determine_count_fn = determine_count_fn + self._tqdm_class = tqdm_class if tqdm_class else MorpheusTqdm + + if isinstance(log_level, LogLevels): # pylint: disable=isinstance-second-argument-not-valid-type + log_level = log_level.value + + self._log_level = log_level + self._enabled = None # defined on first call to _is_enabled + + @property + def delayed_start(self) -> bool: + return self._delayed_start + + @property + def progress(self) -> tqdm: + return self._progress + + def is_enabled(self) -> bool: + """ + Returns a boolean indicating whether or not the logger is enabled. 
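+
+        Examples
+        --------
+        A minimal sketch, assuming the standard ``morpheus`` logger hierarchy; the argument values are
+        illustrative only:
+
+        >>> import logging
+        >>> from morpheus.utils.logger import LogLevels
+        >>> logging.getLogger("morpheus").setLevel(logging.INFO)
+        >>> controller = MonitorController(position=0,
+        ...                                description="Progress",
+        ...                                smoothing=0.05,
+        ...                                unit="messages",
+        ...                                delayed_start=False,
+        ...                                determine_count_fn=None,
+        ...                                log_level=LogLevels.INFO)
+        >>> controller.is_enabled()
+        True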
+ """ + + if self._enabled is None: + self._enabled = logger.isEnabledFor(self._log_level) + + return self._enabled + + def ensure_progress_bar(self): + """ + Ensures that the progress bar is initialized and ready for display. + """ + + if (self._progress is None): + self._progress = self._tqdm_class(desc=self._description, + smoothing=self._smoothing, + dynamic_ncols=True, + unit=(self._unit if self._unit.startswith(" ") else f" {self._unit}"), + mininterval=0.25, + maxinterval=1.0, + miniters=1, + position=self._position) + + self._progress.reset() + + def refresh_progress(self, _): + """ + Refreshes the progress bar display. + """ + self._progress.refresh() + + def progress_sink(self, x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List]): + """ + Receives a message and determines the count of the message. + The progress bar is displayed and the progress is updated. + + Parameters + ---------- + x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List] + Message that determines the count of the message + + Returns + ------- + x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List] + + """ + + # Make sure the progress bar is shown + self.ensure_progress_bar() + + if (self._determine_count_fn is None): + self._determine_count_fn = self.auto_count_fn(x) + + # Skip incase we have empty objects + if (self._determine_count_fn is None): + return x + + # Do our best to determine the count + count = self._determine_count_fn(x) + + self._progress.update(n=count) + + return x + + def auto_count_fn(self, x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List]): + """ + This is a helper function that is used to determine the count of messages received by the + monitor. + + Parameters + ---------- + x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List] + Message that determines the count of the message + + Returns + ------- + Message count. + + """ + + # pylint: disable=too-many-return-statements + + if (x is None): + return None + + # Wait for a list thats not empty + if (isinstance(x, list) and len(x) == 0): + return None + + if (isinstance(x, cudf.DataFrame)): + return lambda y: len(y.index) + + if (isinstance(x, MultiMessage)): + return lambda y: y.mess_count + + if (isinstance(x, MessageMeta)): + return lambda y: y.count + + if isinstance(x, ControlMessage): + + def check_df(y): + df = y.payload().df + if df is not None: + return len(df) + + return 0 + + return check_df + + if (isinstance(x, list)): + item_count_fn = self.auto_count_fn(x[0]) + return lambda y: reduce(lambda sum, z, item_count_fn=item_count_fn: sum + item_count_fn(z), y, 0) + + if (isinstance(x, (str, fsspec.core.OpenFile))): + return lambda y: 1 + + if (hasattr(x, "__len__")): + return len # Return len directly (same as `lambda y: len(y)`) + + raise NotImplementedError(f"Unsupported type: {type(x)}") + + def sink_on_completed(self): + """ + Stops the progress bar and prevents the monitors from writing over each other when the last + stage completes. + """ + + # Set the name to complete. 
This refreshes the display + self.progress.set_description_str(self.progress.desc + "[Complete]") + + self.progress.stop() + + # To prevent the monitors from writing over eachother, stop the monitor when the last stage completes + MonitorController.controller_count -= 1 + + if (MonitorController.controller_count <= 0 and self._tqdm_class.monitor is not None): + self._tqdm_class.monitor.exit() + self._tqdm_class.monitor = None diff --git a/morpheus/controllers/serialize_controller.py b/morpheus/controllers/serialize_controller.py new file mode 100644 index 0000000000..9750741a76 --- /dev/null +++ b/morpheus/controllers/serialize_controller.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import re +import typing + +from morpheus.messages import MessageMeta +from morpheus.messages import MultiMessage + + +class SerializeController: + """ + Controller class for converting data to JSON lines format with customizable column selection and exclusion. + + Parameters + ---------- + include : typing.List[str] + List of columns to include. + exclude : typing.List[str] + List of columns to exclude. + fixed_columns : bool + Flag to indicate whether columns should be fixed. + """ + + def __init__(self, include: typing.List[str], exclude: typing.List[str], fixed_columns: bool): + self._include_columns = copy.copy(include) + self._exclude_columns = copy.copy(exclude) + self._fixed_columns = fixed_columns + self._columns = None + + @property + def include_columns(self): + """ + Get the list of included columns. + """ + return self._include_columns + + @property + def exclude_columns(self): + """ + Get the list of excluded columns. + """ + return self._exclude_columns + + @property + def fixed_columns(self): + """ + Get the flag indicating whether columns are fixed. + """ + return self._fixed_columns + + def convert_to_df(self, + x: MultiMessage, + include_columns: typing.Pattern, + exclude_columns: typing.List[typing.Pattern]): + """ + Converts dataframe to entries to JSON lines. + + Parameters + ---------- + x : `morpheus.pipeline.messages.MultiMessage` + MultiMessage instance that contains data. + include_columns : typing.Pattern + Columns that are required send to downstream stage. + exclude_columns : typing.List[typing.Pattern] + Columns that are not required send to downstream stage. + + """ + + if self._fixed_columns and self._columns is not None: + columns = self._columns + else: + columns: typing.List[str] = [] + + # Minimize access to x.meta.df + df_columns = list(x.meta.df.columns) + + # First build up list of included. 
If no include regex is specified, select all + if (include_columns is None): + columns = df_columns + else: + columns = [y for y in df_columns if include_columns.match(y)] + + # Now remove by the ignore + for test in exclude_columns: + columns = [y for y in columns if not test.match(y)] + + self._columns = columns + + # Get metadata from columns + df = x.get_meta(columns) + + return MessageMeta(df=df) + + def get_include_col_pattern(self): + """ + Get the compiled pattern for include columns. + + Returns + ------- + typing.Pattern + The compiled pattern for include columns. + """ + + include_columns = None + + if (self._include_columns is not None and len(self._include_columns) > 0): + include_columns = re.compile(f"({'|'.join(self._include_columns)})") + + return include_columns + + def get_exclude_col_pattern(self): + """ + Get the list of compiled patterns for exclude columns. + + Returns + ------- + typing.List[typing.Pattern] + The list of compiled patterns for exclude columns. + """ + exclude_columns = [re.compile(x) for x in self._exclude_columns] + + return exclude_columns diff --git a/morpheus/controllers/write_to_file_controller.py b/morpheus/controllers/write_to_file_controller.py new file mode 100644 index 0000000000..15bc014548 --- /dev/null +++ b/morpheus/controllers/write_to_file_controller.py @@ -0,0 +1,136 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import mrc +import mrc.core.operators as ops + +from morpheus.common import FileTypes +from morpheus.common import determine_file_type +from morpheus.io import serializers +from morpheus.messages import MessageMeta +from morpheus.utils.type_aliases import DataFrameType + + +class WriteToFileController: + """ + Controller class for writing data to a file with customizable options. + + Parameters + ---------- + filename : str + The output file name. + overwrite : bool + Flag to indicate whether to overwrite an existing file. + file_type : FileTypes + The type of the output file (e.g., CSV, JSON). + include_index_col : bool + Flag to indicate whether to include the index column in the output. + flush : bool + Flag to indicate whether to flush the output file after writing. + """ + + def __init__(self, filename: str, overwrite: bool, file_type: FileTypes, include_index_col: bool, flush: bool): + self._output_file = filename + self._overwrite = overwrite + + if (os.path.exists(self._output_file)): + if (self._overwrite): + os.remove(self._output_file) + else: + raise FileExistsError( + f"Cannot output classifications to '{self._output_file}'. File exists and overwrite = False") + + self._file_type = file_type + + if (self._file_type == FileTypes.Auto): + self._file_type = determine_file_type(self._output_file) + + self._is_first = True + self._include_index_col = include_index_col + self._flush = flush + + @property + def output_file(self): + """ + Get the output file name. 
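+
+        Examples
+        --------
+        A minimal sketch; the file name below is a hypothetical placeholder:
+
+        >>> controller = WriteToFileController(filename="./results.jsonlines",
+        ...                                    overwrite=True,
+        ...                                    file_type=FileTypes.Auto,
+        ...                                    include_index_col=True,
+        ...                                    flush=False)
+        >>> controller.output_file
+        './results.jsonlines'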
+ """ + return self._output_file + + @property + def overwrite(self): + """ + Get the flag indicating whether to overwrite an existing file. + """ + return self._overwrite + + @property + def file_type(self): + """ + Get the type of the output file. + """ + return self._file_type + + @property + def include_index_col(self): + """ + Get the flag indicating whether to include the index column in the output. + """ + return self._include_index_col + + @property + def flush(self): + """ + Get the flag indicating whether to flush the output file after writing. + """ + return self._flush + + def _convert_to_strings(self, df: DataFrameType): + if self._file_type in (FileTypes.JSON, 'JSON'): + output_strs = serializers.df_to_json(df, include_index_col=self._include_index_col) + elif self._file_type in (FileTypes.CSV, 'CSV'): + output_strs = serializers.df_to_csv(df, + include_header=self._is_first, + include_index_col=self._include_index_col) + self._is_first = False + else: + raise NotImplementedError(f"Unknown file type: {self._file_type}") + + # Remove any trailing whitespace + if (len(output_strs[-1].strip()) == 0): + output_strs = output_strs[:-1] + + return output_strs + + def node_fn(self, obs: mrc.Observable, sub: mrc.Subscriber): + + # Ensure our directory exists + os.makedirs(os.path.realpath(os.path.dirname(self._output_file)), exist_ok=True) + + # Open up the file handle + with open(self._output_file, "a", encoding='UTF-8') as out_file: + + def write_to_file(x: MessageMeta): + + lines = self._convert_to_strings(x.df) + + out_file.writelines(lines) + + if self._flush: + out_file.flush() + + return x + + obs.pipe(ops.map(write_to_file)).subscribe(sub) diff --git a/morpheus/loaders/file_to_df_loader.py b/morpheus/loaders/file_to_df_loader.py index 2169b3f105..ff69d89366 100644 --- a/morpheus/loaders/file_to_df_loader.py +++ b/morpheus/loaders/file_to_df_loader.py @@ -13,28 +13,17 @@ # limitations under the License. """Loader for fetching files and emitting them as DataFrames.""" -import hashlib -import json import logging -import os import pickle -import time -import typing -from functools import partial import fsspec -import fsspec.utils -import pandas as pd import cudf from morpheus.cli.utils import str_to_file_type -from morpheus.common import FileTypes -from morpheus.io.deserializers import read_file_to_df +from morpheus.controllers.file_to_df_controller import FileToDFController from morpheus.messages import ControlMessage from morpheus.messages.message_meta import MessageMeta -from morpheus.utils.column_info import process_dataframe -from morpheus.utils.downloader import Downloader from morpheus.utils.loader_ids import FILE_TO_DF_LOADER from morpheus.utils.loader_utils import register_loader @@ -72,6 +61,8 @@ def file_to_df_loader(control_message: ControlMessage, task: dict): raise RuntimeError("Only 'aggregate' strategy is supported for file_to_df loader.") files = task.get("files", None) + n_groups = task.get("n_groups", None) + config = task["batcher_config"] timestamp_column_name = config.get("timestamp_column_name", "timestamp") @@ -88,14 +79,10 @@ def file_to_df_loader(control_message: ControlMessage, task: dict): parser_kwargs = config.get("parser_kwargs", None) cache_dir = config.get("cache_dir", None) - downloader = Downloader() - if (cache_dir is None): cache_dir = "./.cache" logger.warning("Cache directory not set. 
Defaulting to ./.cache") - cache_dir = os.path.join(cache_dir, "file_cache") - # Load input schema schema = pickle.loads(bytes(schema_str, encoding)) @@ -104,135 +91,20 @@ def file_to_df_loader(control_message: ControlMessage, task: dict): except Exception as exec_info: raise ValueError(f"Invalid input file type '{file_type}'. Available file types are: CSV, JSON.") from exec_info - def single_object_to_dataframe(file_object: fsspec.core.OpenFile, - file_type: FileTypes, - filter_null: bool, - parser_kwargs: dict): - retries = 0 - s3_df = None - while (retries < 2): - try: - with file_object as f: - s3_df = read_file_to_df(f, - file_type, - filter_nulls=filter_null, - df_type="pandas", - parser_kwargs=parser_kwargs) - break - except Exception as exec_info: - if (retries < 2): - logger.warning("Refreshing S3 credentials") - retries += 1 - else: - raise exec_info - - # Run the pre-processing before returning - if (s3_df is None): - return s3_df - - # Optimistaclly prep the dataframe (Not necessary since this will happen again in process_dataframe, but it - # increases performance significantly) - if (schema.prep_dataframe is not None): - s3_df = schema.prep_dataframe(s3_df) - - return s3_df - - def get_or_create_dataframe_from_s3_batch(file_name_batch: typing.List[str]) -> typing.Tuple[cudf.DataFrame, bool]: - - if (not file_name_batch): - raise RuntimeError("No file objects to process") - - file_list = fsspec.open_files(file_name_batch) - # batch_count = file_name_batch[1] - - file_system: fsspec.AbstractFileSystem = file_list.fs - - # Create a list of dictionaries that only contains the information we are interested in hashing. `ukey` just - # hashes all the output of `info()` which is perfect - hash_data = [{"ukey": file_system.ukey(file_object.path)} for file_object in file_list] - - # Convert to base 64 encoding to remove - values - objects_hash_hex = hashlib.md5(json.dumps(hash_data, sort_keys=True).encode()).hexdigest() - - batch_cache_location = os.path.join(cache_dir, "batches", f"{objects_hash_hex}.pkl") - - # Return the cache if it exists - if (os.path.exists(batch_cache_location)): - output_df = pd.read_pickle(batch_cache_location) - output_df["origin_hash"] = objects_hash_hex - # output_df["batch_count"] = batch_count - - return (output_df, True) - - # Cache miss - download_method_func = partial(single_object_to_dataframe, - file_type=file_type, - filter_null=filter_null, - parser_kwargs=parser_kwargs) - - download_buckets = file_list - - # Loop over dataframes and concat into one - try: - dfs = downloader.download(download_buckets, download_method_func) - except Exception: - logger.exception("Failed to download logs. Error: ", exc_info=True) - raise - - if (dfs is None or len(dfs) == 0): - raise ValueError("No logs were downloaded") - - output_df: pd.DataFrame = pd.concat(dfs) - output_df = process_dataframe(df_in=output_df, input_schema=schema) - - # Finally sort by timestamp and then reset the index - output_df.sort_values(by=[timestamp_column_name], inplace=True) - - output_df.reset_index(drop=True, inplace=True) - - # Save dataframe to cache future runs - os.makedirs(os.path.dirname(batch_cache_location), exist_ok=True) - - try: - output_df.to_pickle(batch_cache_location) - except Exception: - logger.warning("Failed to save batch cache. 
Skipping cache for this batch.", exc_info=True) - - # output_df["batch_count"] = batch_count - output_df["origin_hash"] = objects_hash_hex - - return (output_df, False) - - def convert_to_dataframe(filenames: typing.List[str]): - - if (not filenames): - return None - - start_time = time.time() - - try: - - output_df, cache_hit = get_or_create_dataframe_from_s3_batch(filenames) - - duration = (time.time() - start_time) * 1000.0 - - if (output_df is not None and logger.isEnabledFor(logging.DEBUG)): - logger.debug("S3 objects to DF complete. Rows: %s, Cache: %s, Duration: %s ms, Rate: %s rows/s", - len(output_df), - "hit" if cache_hit else "miss", - duration, - len(output_df) / (duration / 1000.0)) - - return output_df - except Exception: - logger.exception("Error while converting S3 buckets to DF.") - raise - - pdf = convert_to_dataframe(files) - - df = cudf.from_pandas(pdf) - - # Overwriting payload with derived data - control_message.payload(MessageMeta(df)) + try: + controller = FileToDFController(schema=schema, + filter_null=filter_null, + file_type=file_type, + parser_kwargs=parser_kwargs, + cache_dir=cache_dir, + timestamp_column_name=timestamp_column_name) + pdf = controller.convert_to_dataframe(file_object_batch=(fsspec.open_files(files), n_groups)) + df = cudf.from_pandas(pdf) + + # Overwriting payload with derived data + control_message.payload(MessageMeta(df)) + + finally: + controller.close() return control_message diff --git a/morpheus/modules/file_to_df.py b/morpheus/modules/file_to_df.py index 32c09f8a66..d7c053aef4 100644 --- a/morpheus/modules/file_to_df.py +++ b/morpheus/modules/file_to_df.py @@ -13,28 +13,14 @@ # limitations under the License. """Morpheus pipeline module for fetching files and emitting them as DataFrames.""" -import hashlib -import json import logging -import os import pickle -import time -import typing -from functools import partial -import fsspec -import fsspec.utils import mrc -import pandas as pd from mrc.core import operators as ops -import cudf - from morpheus.cli.utils import str_to_file_type -from morpheus.common import FileTypes -from morpheus.io.deserializers import read_file_to_df -from morpheus.utils.column_info import process_dataframe -from morpheus.utils.downloader import Downloader +from morpheus.controllers.file_to_df_controller import FileToDFController from morpheus.utils.module_ids import FILE_TO_DF from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_utils import register_module @@ -80,14 +66,10 @@ def file_to_df(builder: mrc.Builder): parser_kwargs = config.get("parser_kwargs", None) cache_dir = config.get("cache_dir", None) - downloader = Downloader() - if (cache_dir is None): cache_dir = "./.cache" logger.warning("Cache directory not set. Defaulting to ./.cache") - cache_dir = os.path.join(cache_dir, "file_cache") - # Load input schema schema = pickle.loads(bytes(schema_str, encoding)) @@ -96,136 +78,14 @@ def file_to_df(builder: mrc.Builder): except Exception as exec_info: raise ValueError(f"Invalid input file type '{file_type}'. 
Available file types are: CSV, JSON.") from exec_info - def single_object_to_dataframe(file_object: fsspec.core.OpenFile, - file_type: FileTypes, - filter_null: bool, - parser_kwargs: dict): - - retries = 0 - s3_df = None - while (retries < 2): - try: - with file_object as f: - s3_df = read_file_to_df(f, - file_type, - filter_nulls=filter_null, - df_type="pandas", - parser_kwargs=parser_kwargs) - - break - except Exception as e: - if (retries < 2): - logger.warning("Refreshing S3 credentials") - retries += 1 - else: - raise e - - # Run the pre-processing before returning - if (s3_df is None): - return s3_df - - # Optimistaclly prep the dataframe (Not necessary since this will happen again in process_dataframe, but it - # increases performance significantly) - if (schema.prep_dataframe is not None): - s3_df = schema.prep_dataframe(s3_df) - - return s3_df - - def get_or_create_dataframe_from_s3_batch( - file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[cudf.DataFrame, bool]: - - if (not file_object_batch): - raise RuntimeError("No file objects to process") - - file_list = file_object_batch[0] - batch_count = file_object_batch[1] - - file_system: fsspec.AbstractFileSystem = file_list.fs - - # Create a list of dictionaries that only contains the information we are interested in hashing. `ukey` just - # hashes all of the output of `info()` which is perfect - hash_data = [{"ukey": file_system.ukey(file_object.path)} for file_object in file_list] - - # Convert to base 64 encoding to remove - values - objects_hash_hex = hashlib.md5(json.dumps(hash_data, sort_keys=True).encode()).hexdigest() - - batch_cache_location = os.path.join(cache_dir, "batches", f"{objects_hash_hex}.pkl") - - # Return the cache if it exists - if (os.path.exists(batch_cache_location)): - output_df = pd.read_pickle(batch_cache_location) - output_df["origin_hash"] = objects_hash_hex - output_df["batch_count"] = batch_count - - return (output_df, True) - - # Cache miss - download_method_func = partial(single_object_to_dataframe, - file_type=file_type, - filter_null=filter_null, - parser_kwargs=parser_kwargs) - - download_buckets = file_list - - # Loop over dataframes and concat into one - try: - dfs = downloader.download(download_buckets, download_method_func) - except Exception: - logger.exception("Failed to download logs. Error: ", exc_info=True) - raise - - if (dfs is None or len(dfs) == 0): - raise ValueError("No logs were downloaded") - - output_df: pd.DataFrame = pd.concat(dfs) - - output_df = process_dataframe(df_in=output_df, input_schema=schema) - - # Finally sort by timestamp and then reset the index - output_df.sort_values(by=[timestamp_column_name], inplace=True) - - output_df.reset_index(drop=True, inplace=True) - - # Save dataframe to cache future runs - os.makedirs(os.path.dirname(batch_cache_location), exist_ok=True) - - try: - output_df.to_pickle(batch_cache_location) - except Exception: - logger.warning("Failed to save batch cache. 
Skipping cache for this batch.", exc_info=True) - - output_df["batch_count"] = batch_count - output_df["origin_hash"] = objects_hash_hex - - return (output_df, False) - - def convert_to_dataframe(file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]): - if (not file_object_batch): - return None - - start_time = time.time() - - try: - output_df, cache_hit = get_or_create_dataframe_from_s3_batch(file_object_batch) - - duration = (time.time() - start_time) * 1000.0 - - if (output_df is not None and logger.isEnabledFor(logging.DEBUG)): - logger.debug("S3 objects to DF complete. Rows: %s, Cache: %s, Duration: %s ms, Rate: %s rows/s", - len(output_df), - "hit" if cache_hit else "miss", - duration, - len(output_df) / (duration / 1000.0)) - - return output_df - except Exception: - logger.exception("Error while converting S3 buckets to DF.") - raise - - def node_fn(obs: mrc.Observable, sub: mrc.Subscriber): - obs.pipe(ops.map(convert_to_dataframe), ops.on_completed(downloader.close)).subscribe(sub) + controller = FileToDFController(schema=schema, + filter_null=filter_null, + file_type=file_type, + parser_kwargs=parser_kwargs, + cache_dir=cache_dir, + timestamp_column_name=timestamp_column_name) - node = builder.make_node(FILE_TO_DF, mrc.core.operators.build(node_fn)) + node = builder.make_node(FILE_TO_DF, ops.map(controller.convert_to_dataframe), ops.on_completed(controller.close)) # Register input and output port for a module. builder.register_module_input("input", node) diff --git a/morpheus/modules/filter_detections.py b/morpheus/modules/filter_detections.py index e19e54e5d6..41a59639ac 100644 --- a/morpheus/modules/filter_detections.py +++ b/morpheus/modules/filter_detections.py @@ -14,17 +14,13 @@ import logging import pickle -import typing -import cupy as cp import mrc -import numpy as np -import typing_utils from mrc.core import operators as ops +import morpheus._lib.stages as _stages from morpheus.common import FilterSource -from morpheus.messages import MultiMessage -from morpheus.messages.multi_response_message import MultiResponseMessage +from morpheus.controllers.filter_detections_controller import FilterDetectionsController from morpheus.utils.module_ids import FILTER_DETECTIONS from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_utils import register_module @@ -85,6 +81,10 @@ def filter_detections(builder: mrc.Builder): field_name = config.get("field_name", "probs") threshold = config.get("threshold", 0.5) filter_source = config.get("filter_source", "AUTO") + use_cpp = config.get("use_cpp", False) + + filter_source_dict = {"AUTO": FilterSource.Auto, "DATAFRAME": FilterSource.DATAFRAME, "TENSOR": FilterSource.TENSOR} + copy = config.get("copy", True) if ("schema" not in config): @@ -96,100 +96,27 @@ def filter_detections(builder: mrc.Builder): message_type = pickle.loads(bytes(input_message_type, encoding)) - def find_detections(multi_message: MultiMessage, _filter_source) -> typing.Union[cp.ndarray, np.ndarray]: - - # Determind the filter source - if _filter_source == FilterSource.TENSOR: - _filter_source = multi_message.get_output(field_name) - else: - _filter_source = multi_message.get_meta(field_name).values - - if (isinstance(_filter_source, np.ndarray)): - array_mod = np - else: - array_mod = cp - - # Get per row detections - detections = (_filter_source > threshold) - - if (len(detections.shape) > 1): - detections = detections.any(axis=1) - - # Surround in False to ensure we get an even number of pairs - detections = 
array_mod.concatenate([array_mod.array([False]), detections, array_mod.array([False])]) - - return array_mod.where(detections[1:] != detections[:-1])[0].reshape((-1, 2)) - - def filter_copy(multi_message: MultiMessage) -> typing.Union[MultiMessage, None]: - """ - This function uses a threshold value to filter the messages. - - Parameters - ---------- - multi_message : `morpheus.pipeline.messages.MultiMessage` - Response message with probabilities calculated from inference results. - - Returns - ------- - `morpheus.pipeline.messages.MultiMessage` - A new message containing a copy of the rows above the threshold. - - """ - if multi_message is None: - return None + controller = FilterDetectionsController(threshold=threshold, + filter_source=filter_source_dict[filter_source], + field_name=field_name) - true_pairs = find_detections(multi_message, filter_source) + controller.update_filter_source(message_type=message_type) - if (true_pairs.shape[0] == 0): - return None - - return multi_message.copy_ranges(true_pairs) - - def filter_slice(multi_message: MultiMessage) -> typing.List[MultiMessage]: - """ - This function uses a threshold value to filter the messages. - - Parameters - ---------- - multi_message : `morpheus.pipeline.messages.MultiMessage` - Response message with probabilities calculated from inference results. - - Returns - ------- - typing.List[`morpheus.pipeline.messages.MultiMessage`] - List of filtered messages. - - """ - - # Unfortunately we have to convert this to a list in case there are non-contiguous groups - output_list = [] - if multi_message is not None: - true_pairs = find_detections(multi_message, filter_source) - for pair in true_pairs: - pair = tuple(pair.tolist()) - if ((pair[1] - pair[0]) > 0): - output_list.append(multi_message.get_slice(*pair)) - - return output_list - - if filter_source == "AUTO": - if (typing_utils.issubtype(message_type, MultiResponseMessage)): - filter_source = FilterSource.TENSOR - else: - filter_source = FilterSource.DATAFRAME - - # logger.debug(f"filter_source was set to Auto, infering a filter source of {filter_source} based on an input " - # "message type of {message_type}") - elif filter_source == "DATAFRAME": - filter_source = FilterSource.DATAFRAME + if use_cpp: + node = _stages.FilterDetectionsStage(builder, + FILTER_DETECTIONS, + controller.threshold, + copy, + controller.filter_source, + controller.field_name) else: - raise RuntimeError(f"Unknown filter source: {filter_source}") - - if copy: - node = builder.make_node(FILTER_DETECTIONS, ops.map(filter_copy)) - else: - # Convert list returned by `filter_slice` back to individual messages - node = builder.make_node(FILTER_DETECTIONS, ops.map(filter_slice), ops.flatten()) + if copy: + node = builder.make_node(FILTER_DETECTIONS, + ops.map(controller.filter_copy), + ops.filter(lambda x: x is not None)) + else: + # Convert list returned by `filter_slice` back to individual messages + node = builder.make_node(FILTER_DETECTIONS, ops.map(controller.filter_slice), ops.flatten()) # Register input and output port for a module. builder.register_module_input("input", node) diff --git a/morpheus/modules/mlflow_model_writer.py b/morpheus/modules/mlflow_model_writer.py index 4facb7c0ba..6c842d64d3 100644 --- a/morpheus/modules/mlflow_model_writer.py +++ b/morpheus/modules/mlflow_model_writer.py @@ -12,29 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
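+
+# Example module configuration for `mlflow_model_writer` (an illustrative sketch only -- the formatter
+# strings and conda environment are placeholders; `timeout` and `timestamp_column_name` show the
+# documented defaults):
+#
+#     mlflow_writer_config = {
+#         "model_name_formatter": "dfp-{user_id}",
+#         "experiment_name_formatter": "dfp/{reg_model_name}",
+#         "conda_env": {"dependencies": ["python=3.10", {"pip": ["mlflow", "torch"]}]},
+#         "timeout": 1.0,
+#         "timestamp_column_name": "timestamp",
+#     }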
-import hashlib import logging -import os -import typing -import urllib.parse -import mlflow import mrc -import requests -from mlflow.exceptions import MlflowException -from mlflow.models.signature import ModelSignature -from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS -from mlflow.protos.databricks_pb2 import ErrorCode -from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository -from mlflow.tracking import MlflowClient -from mlflow.types import ColSpec -from mlflow.types import Schema -from mlflow.types.utils import _infer_pandas_column -from mlflow.types.utils import _infer_schema from mrc.core import operators as ops -from morpheus.messages.multi_ae_message import MultiAEMessage -from morpheus.models.dfencoder import AutoEncoder +from morpheus.controllers.mlflow_model_writer_controller import MLFlowModelWriterController from morpheus.utils.module_ids import MLFLOW_MODEL_WRITER from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_utils import register_module @@ -62,7 +45,7 @@ def mlflow_model_writer(builder: mrc.Builder): - model_name_formatter (str): Formatter for the model name; Example: `model_name_{timestamp}`; Default: `[Required]` - timestamp_column_name (str): Name of the timestamp column; Example: `timestamp`; Default: timestamp - - source (str): from source where the logs are generated; Example: `azure`; Default: `[Required]` + - timeout (float): Timeout for get requests. databricks_permissions: - read (array): List of users with read permissions; Example: `["read_user1", "read_user2"]`; Default: - @@ -71,11 +54,9 @@ def mlflow_model_writer(builder: mrc.Builder): config = builder.get_current_module_config() + timeout = config.get("timeout", 1.0) timestamp_column_name = config.get("timestamp_column_name", "timestamp") - if ("source" not in config): - raise ValueError("Source is required") - if ("model_name_formatter" not in config): raise ValueError("Model name formatter is required") @@ -85,190 +66,21 @@ def mlflow_model_writer(builder: mrc.Builder): if ("conda_env" not in config): raise ValueError("Conda environment is required") - source = config["source"] model_name_formatter = config["model_name_formatter"] experiment_name_formatter = config["experiment_name_formatter"] conda_env = config.get("conda_env", None) databricks_permissions = config.get("databricks_permissions", None) - def user_id_to_model(user_id: str): - - kwargs = { - "user_id": user_id, - "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), - } - - return model_name_formatter.format(**kwargs) - - def user_id_to_experiment(user_id: str): - - kwargs = { - "user_id": user_id, - "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), - "reg_model_name": user_id_to_model(user_id=user_id) - } - - return experiment_name_formatter.format(**kwargs) - - def apply_model_permissions(reg_model_name: str): - - # Check the required variables - databricks_host = os.environ.get("DATABRICKS_HOST", None) - databricks_token = os.environ.get("DATABRICKS_TOKEN", None) - - if (databricks_host is None or databricks_token is None): - raise RuntimeError("Cannot set Databricks model permissions. 
" - "Environment variables `DATABRICKS_HOST` and `DATABRICKS_TOKEN` must be set") - - headers = {"Authorization": f"Bearer {databricks_token}"} - - url_base = f"{databricks_host}" - - try: - # First get the registered model ID - get_registered_model_url = urllib.parse.urljoin(url_base, - "/api/2.0/mlflow/databricks/registered-models/get") - - # Remove once https://github.com/nv-morpheus/Morpheus/issues/1050 is resolved - # pylint: disable=missing-timeout - get_registered_model_response = requests.get(url=get_registered_model_url, - headers=headers, - params={"name": reg_model_name}) - - registered_model_response = get_registered_model_response.json() - - reg_model_id = registered_model_response["registered_model_databricks"]["id"] - - # Now apply the permissions. If it exists already, it will be overwritten or it is a no-op - patch_registered_model_permissions_url = urllib.parse.urljoin( - url_base, f"/api/2.0/preview/permissions/registered-models/{reg_model_id}") - - patch_registered_model_permissions_body = { - "access_control_list": [{ - "group_name": group, "permission_level": permission - } for group, - permission in databricks_permissions.items()] - } - - requests.patch(url=patch_registered_model_permissions_url, - headers=headers, - json=patch_registered_model_permissions_body) - - except Exception: - logger.exception("Error occurred trying to apply model permissions to model: %s", - reg_model_name, - exc_info=True) - - def on_data(message: MultiAEMessage): - - user = message.meta.user_id - - model: AutoEncoder = message.model - - model_path = "dfencoder" - reg_model_name = user_id_to_model(user_id=user) - - # Write to ML Flow - try: - mlflow.end_run() - - experiment_name = user_id_to_experiment(user_id=user) - - # Creates a new experiment if it doesnt exist - experiment = mlflow.set_experiment(experiment_name) - - with mlflow.start_run(run_name=f"{source} autoencoder model training run", - experiment_id=experiment.experiment_id) as run: - - model_path = f"{model_path}-{run.info.run_uuid}" - - # Log all params in one dict to avoid round trips - mlflow.log_params({ - "Algorithm": "Denosing Autoencoder", - "Epochs": model.lr_decay.state_dict().get("last_epoch", "unknown"), - "Learning rate": model.lr, - "Batch size": model.batch_size, - "Start Epoch": message.get_meta("timestamp").min(), - "End Epoch": message.get_meta("timestamp").max(), - "Log Count": message.mess_count, - }) - - metrics_dict: typing.Dict[str, float] = {} - - # Add info on the embeddings - for k, val in model.categorical_fts.items(): - embedding = val.get("embedding", None) - - if (embedding is None): - continue - - metrics_dict[f"embedding-{k}-num_embeddings"] = embedding.num_embeddings - metrics_dict[f"embedding-{k}-embedding_dim"] = embedding.embedding_dim - - mlflow.log_metrics(metrics_dict) - - # Use the prepare_df function to setup the direct inputs to the model. 
Only include features - # returned by prepare_df to show the actual inputs to the model (any extra are discarded) - input_df = message.get_meta().iloc[0:1].to_pandas() - prepared_df = model.prepare_df(input_df) - output_values = model.get_anomaly_score(input_df) - - input_schema = Schema([ - ColSpec(type=_infer_pandas_column(input_df[col_name]), name=col_name) - for col_name in list(prepared_df.columns) - ]) - output_schema = _infer_schema(output_values) - - model_sig = ModelSignature(inputs=input_schema, outputs=output_schema) - - model_info = mlflow.pytorch.log_model( - pytorch_model=model, - artifact_path=model_path, - conda_env=conda_env, - signature=model_sig, - ) - - client = MlflowClient() - - # First ensure a registered model has been created - try: - create_model_response = client.create_registered_model(reg_model_name) - logger.debug("Successfully registered model '%s'.", create_model_response.name) - except MlflowException as e: - if e.error_code == ErrorCode.Name(RESOURCE_ALREADY_EXISTS): - pass - else: - raise e - - # If we are using databricks, make sure we set the correct permissions - if (databricks_permissions is not None and mlflow.get_tracking_uri() == "databricks"): - # Need to apply permissions - apply_model_permissions(reg_model_name=reg_model_name) - - model_src = RunsArtifactRepository.get_underlying_uri(model_info.model_uri) - - tags = { - "start": message.get_meta(timestamp_column_name).min(), - "end": message.get_meta(timestamp_column_name).max(), - "count": message.get_meta(timestamp_column_name).count() - } - - # Now create the model version - model_ver = client.create_model_version(name=reg_model_name, - source=model_src, - run_id=run.info.run_id, - tags=tags) - - logger.debug("ML Flow model upload complete: %s:%s:%s", user, reg_model_name, model_ver.version) - - except Exception: - logger.exception("Error uploading model to ML Flow", exc_info=True) - - return message + controller = MLFlowModelWriterController(model_name_formatter=model_name_formatter, + experiment_name_formatter=experiment_name_formatter, + databricks_permissions=databricks_permissions, + conda_env=conda_env, + timeout=timeout, + timestamp_column_name=timestamp_column_name) def node_fn(obs: mrc.Observable, sub: mrc.Subscriber): - obs.pipe(ops.map(on_data), ops.filter(lambda x: x is not None)).subscribe(sub) + obs.pipe(ops.map(controller.on_data), ops.filter(lambda x: x is not None)).subscribe(sub) node = builder.make_node(MLFLOW_MODEL_WRITER, mrc.core.operators.build(node_fn)) diff --git a/morpheus/modules/serialize.py b/morpheus/modules/serialize.py index 3263e33759..9fd8b4bd31 100644 --- a/morpheus/modules/serialize.py +++ b/morpheus/modules/serialize.py @@ -13,17 +13,11 @@ # limitations under the License. 
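+
+# Example module configuration for `serialize` (an illustrative sketch -- the values shown are the
+# documented defaults read via `builder.get_current_module_config()`):
+#
+#     serialize_config = {
+#         "include": None,
+#         "exclude": [r'^ID$', r'^_ts_'],
+#         "fixed_columns": True,
+#     }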
import logging -import re -import typing from functools import partial import mrc -import pandas as pd -import cudf - -from morpheus.messages import MultiMessage -from morpheus.messages.message_meta import MessageMeta +from morpheus.controllers.serialize_controller import SerializeController from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_ids import SERIALIZE from morpheus.utils.module_utils import register_module @@ -58,64 +52,17 @@ def serialize(builder: mrc.Builder): config = builder.get_current_module_config() - include_columns = config.get("include", None) - exclude_columns = config.get("exclude", [r'^ID$', r'^_ts_']) + include = config.get("include", None) + exclude = config.get("exclude", [r'^ID$', r'^_ts_']) fixed_columns = config.get("fixed_columns", True) - columns = config.get("columns", None) - use_cpp = config.get("use_cpp", False) - - def convert_to_df(x: MultiMessage, - include_columns: typing.Pattern, - exclude_columns: typing.List[typing.Pattern], - columns: typing.List[str]): - """ - Converts dataframe to entries to JSON lines. - - Parameters - ---------- - x : `morpheus.pipeline.messages.MultiMessage` - MultiMessage instance that contains data. - include_columns : typing.Pattern - Columns that are required send to downstream stage. - exclude_columns : typing.List[typing.Pattern] - Columns that are not required send to downstream stage. - columns : typing.List[str] - Explicit list of columns to include, if not `None` and `fixed_columns` is `True`, then `include_columns` - and `exclude_columns` will be ignored. - """ - - if (not fixed_columns or columns is None): - columns: typing.List[str] = [] - - # Minimize access to x.meta.df - df_columns = list(x.meta.df.columns) - - # First build up list of included. If no include regex is specified, select all - if (include_columns is None): - columns = df_columns - else: - columns = [y for y in df_columns if include_columns.match(y)] - - # Now remove by the ignore - for test in exclude_columns: - columns = [y for y in columns if not test.match(y)] - - # Get metadata from columns - df = x.get_meta(columns) - - if (isinstance(df, pd.DataFrame) and use_cpp): - df = cudf.from_pandas(df) - - return MessageMeta(df=df) - if (include_columns is not None and len(include_columns) > 0): - include_columns = re.compile(f"({'|'.join(include_columns)})") + controller = SerializeController(include=include, exclude=exclude, fixed_columns=fixed_columns) - exclude_columns = [re.compile(x) for x in exclude_columns] + include_columns = controller.get_include_col_pattern() + exclude_columns = controller.get_exclude_col_pattern() node = builder.make_node( - SERIALIZE, - partial(convert_to_df, include_columns=include_columns, exclude_columns=exclude_columns, columns=columns)) + SERIALIZE, partial(controller.convert_to_df, include_columns=include_columns, exclude_columns=exclude_columns)) # Register input and output port for a module. 
builder.register_module_input("input", node) diff --git a/morpheus/modules/write_to_file.py b/morpheus/modules/write_to_file.py index 5067bb45b8..c2a7b0b9b2 100644 --- a/morpheus/modules/write_to_file.py +++ b/morpheus/modules/write_to_file.py @@ -14,19 +14,11 @@ """To File Sink Module.""" import logging -import os -import typing import mrc -import pandas as pd -from mrc.core import operators as ops - -import cudf from morpheus.common import FileTypes -from morpheus.common import determine_file_type -from morpheus.io import serializers -from morpheus.messages.message_meta import MessageMeta +from morpheus.controllers.write_to_file_controller import WriteToFileController from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_ids import WRITE_TO_FILE from morpheus.utils.module_utils import register_module @@ -55,67 +47,19 @@ def write_to_file(builder: mrc.Builder): """ config = builder.get_current_module_config() - output_file = config.get("filename", None) + filename = config.get("filename", None) overwrite = config.get("overwrite", False) flush = config.get("flush", False) file_type = config.get("file_type", FileTypes.Auto) include_index_col = config.get("include_index_col", True) - is_first = True - - if (os.path.exists(output_file)): - if (overwrite): - os.remove(output_file) - else: - raise FileExistsError( - f"Cannot output classifications to '{output_file}'. File exists and overwrite = False") - - if (file_type == FileTypes.Auto): - file_type = determine_file_type(output_file) - - def convert_to_strings(df: typing.Union[pd.DataFrame, cudf.DataFrame]): - nonlocal is_first - - if (file_type == FileTypes.JSON): - output_strs = serializers.df_to_json(df, include_index_col=include_index_col) - elif (file_type == FileTypes.CSV): - output_strs = serializers.df_to_csv(df, include_header=is_first, include_index_col=include_index_col) - else: - raise NotImplementedError(f"Unknown file type: {file_type}") - - is_first = False - - # Remove any trailing whitespace - if (len(output_strs[-1].strip()) == 0): - output_strs = output_strs[:-1] - - return output_strs - - # Sink to file - - def node_fn(obs: mrc.Observable, sub: mrc.Subscriber): - - # Ensure our directory exists - os.makedirs(os.path.realpath(os.path.dirname(output_file)), exist_ok=True) - - # Open up the file handle - with open(output_file, "a", encoding='UTF-8') as out_file: - - def _write_to_file(x: MessageMeta): - lines = convert_to_strings(x.df) - - out_file.writelines(lines) - - if flush: - out_file.flush() - - return x - - obs.pipe(ops.map(_write_to_file)).subscribe(sub) - - # File should be closed by here + controller = WriteToFileController(filename=filename, + overwrite=overwrite, + file_type=file_type, + include_index_col=include_index_col, + flush=flush) - node = builder.make_node(WRITE_TO_FILE, mrc.core.operators.build(node_fn)) + node = builder.make_node(WRITE_TO_FILE, mrc.core.operators.build(controller.node_fn)) # Register input and output port for a module. 
builder.register_module_input("input", node) diff --git a/morpheus/stages/general/monitor_stage.py b/morpheus/stages/general/monitor_stage.py index c3e318bcd6..66b6118407 100644 --- a/morpheus/stages/general/monitor_stage.py +++ b/morpheus/stages/general/monitor_stage.py @@ -21,10 +21,10 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.controllers.monitor_controller import MonitorController from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stream_pair import StreamPair from morpheus.utils.logger import LogLevels -from morpheus.utils.monitor_utils import MonitorController logger = logging.getLogger(__name__) diff --git a/morpheus/stages/output/write_to_file_stage.py b/morpheus/stages/output/write_to_file_stage.py index c6405587e8..a23468a418 100644 --- a/morpheus/stages/output/write_to_file_stage.py +++ b/morpheus/stages/output/write_to_file_stage.py @@ -13,7 +13,6 @@ # limitations under the License. """Write to file stage.""" -import os import typing import mrc @@ -22,13 +21,11 @@ import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.common import FileTypes -from morpheus.common import determine_file_type from morpheus.config import Config -from morpheus.io import serializers +from morpheus.controllers.write_to_file_controller import WriteToFileController from morpheus.messages import MessageMeta from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stream_pair import StreamPair -from morpheus.utils.type_aliases import DataFrameType @register_stage("to-file", rename_options={"include_index_col": "--include-index-col"}) @@ -65,24 +62,11 @@ def __init__(self, super().__init__(c) - self._output_file = filename - self._overwrite = overwrite - - if (os.path.exists(self._output_file)): - if (self._overwrite): - os.remove(self._output_file) - else: - raise FileExistsError( - f"Cannot output classifications to '{self._output_file}'. 
File exists and overwrite = False") - - self._file_type = file_type - - if (self._file_type == FileTypes.Auto): - self._file_type = determine_file_type(self._output_file) - - self._is_first = True - self._include_index_col = include_index_col - self._flush = flush + self._controller = WriteToFileController(filename=filename, + overwrite=overwrite, + file_type=file_type, + include_index_col=include_index_col, + flush=flush) @property def name(self) -> str: @@ -105,23 +89,6 @@ def supports_cpp_node(self): """Indicates whether this stage supports a C++ node.""" return True - def _convert_to_strings(self, df: DataFrameType): - if (self._file_type == FileTypes.JSON): - output_strs = serializers.df_to_json(df, include_index_col=self._include_index_col) - elif (self._file_type == FileTypes.CSV): - output_strs = serializers.df_to_csv(df, - include_header=self._is_first, - include_index_col=self._include_index_col) - self._is_first = False - else: - raise NotImplementedError(f"Unknown file type: {self._file_type}") - - # Remove any trailing whitespace - if (len(output_strs[-1].strip()) == 0): - output_strs = output_strs[:-1] - - return output_strs - def _build_single(self, builder: mrc.Builder, input_stream: StreamPair) -> StreamPair: stream = input_stream[0] @@ -130,37 +97,14 @@ def _build_single(self, builder: mrc.Builder, input_stream: StreamPair) -> Strea if (self._build_cpp_node()): to_file = _stages.WriteToFileStage(builder, self.unique_name, - self._output_file, + self._controller.output_file, "w", - self._file_type, - self._include_index_col, - self._flush) + self._controller.file_type, + self._controller.include_index_col, + self._controller.flush) else: - def node_fn(obs: mrc.Observable, sub: mrc.Subscriber): - - # Ensure our directory exists - os.makedirs(os.path.realpath(os.path.dirname(self._output_file)), exist_ok=True) - - # Open up the file handle - with open(self._output_file, "a", encoding='UTF-8') as out_file: - - def write_to_file(x: MessageMeta): - - lines = self._convert_to_strings(x.df) - - out_file.writelines(lines) - - if self._flush: - out_file.flush() - - return x - - obs.pipe(ops.map(write_to_file)).subscribe(sub) - - # File should be closed by here - - to_file = builder.make_node(self.unique_name, ops.build(node_fn)) + to_file = builder.make_node(self.unique_name, ops.build(self._controller.node_fn)) builder.make_edge(stream, to_file) stream = to_file diff --git a/morpheus/stages/postprocess/filter_detections_stage.py b/morpheus/stages/postprocess/filter_detections_stage.py index fb24c7f142..2682300e68 100644 --- a/morpheus/stages/postprocess/filter_detections_stage.py +++ b/morpheus/stages/postprocess/filter_detections_stage.py @@ -15,16 +15,14 @@ import logging import typing -import cupy as cp import mrc -import numpy as np -import typing_utils from mrc.core import operators as ops import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.common import FilterSource from morpheus.config import Config +from morpheus.controllers.filter_detections_controller import FilterDetectionsController from morpheus.messages import MultiMessage from morpheus.messages import MultiResponseMessage from morpheus.pipeline.single_port_stage import SinglePortStage @@ -85,12 +83,10 @@ def __init__(self, field_name: str = "probs"): super().__init__(c) - # Probability to consider a detection - self._threshold = threshold self._copy = copy - - self._filter_source = filter_source - self._field_name = field_name + self._controller = 
FilterDetectionsController(threshold=threshold, + filter_source=filter_source, + field_name=field_name) @property def name(self) -> str: @@ -106,7 +102,7 @@ def accepted_types(self) -> typing.Tuple: Accepted input types. """ - if self._filter_source == FilterSource.TENSOR: + if self._controller.filter_source == FilterSource.TENSOR: return (MultiResponseMessage, ) return (MultiMessage, ) @@ -115,109 +111,27 @@ def supports_cpp_node(self): # Enable support by default return True - def _find_detections(self, x: MultiMessage) -> typing.Union[cp.ndarray, np.ndarray]: - # Determind the filter source - if self._filter_source == FilterSource.TENSOR: - filter_source = x.get_output(self._field_name) - else: - filter_source = x.get_meta(self._field_name).values - - if (isinstance(filter_source, np.ndarray)): - array_mod = np - else: - array_mod = cp - - # Get per row detections - detections = (filter_source > self._threshold) - - if (len(detections.shape) > 1): - detections = detections.any(axis=1) - - # Surround in False to ensure we get an even number of pairs - detections = array_mod.concatenate([array_mod.array([False]), detections, array_mod.array([False])]) - - return array_mod.where(detections[1:] != detections[:-1])[0].reshape((-1, 2)) - - def filter_copy(self, x: MultiMessage) -> MultiMessage: - """ - This function uses a threshold value to filter the messages. - - Parameters - ---------- - x : `morpheus.pipeline.messages.MultiMessage` - Response message with probabilities calculated from inference results. - - Returns - ------- - `morpheus.pipeline.messages.MultiMessage` - A new message containing a copy of the rows above the threshold. - - """ - if x is None: - return None - - true_pairs = self._find_detections(x) - - # If we didnt have any detections, return None - if (true_pairs.shape[0] == 0): - return None - - return x.copy_ranges(true_pairs) - - def filter_slice(self, x: MultiMessage) -> typing.List[MultiMessage]: - """ - This function uses a threshold value to filter the messages. - - Parameters - ---------- - x : `morpheus.pipeline.messages.MultiMessage` - Response message with probabilities calculated from inference results. - - Returns - ------- - typing.List[`morpheus.pipeline.messages.MultiMessage`] - List of filtered messages. 
- - """ - # Unfortunately we have to convert this to a list in case there are non-contiguous groups - output_list = [] - if x is not None: - true_pairs = self._find_detections(x) - for pair in true_pairs: - pair = tuple(pair.tolist()) - if ((pair[1] - pair[0]) > 0): - output_list.append(x.get_slice(*pair)) - - return output_list - def _build_single(self, builder: mrc.Builder, input_stream: StreamPair) -> StreamPair: (parent_node, message_type) = input_stream - if self._filter_source == FilterSource.Auto: - if (typing_utils.issubtype(message_type, MultiResponseMessage)): - self._filter_source = FilterSource.TENSOR - else: - self._filter_source = FilterSource.DATAFRAME - logger.debug(("filter_source was set to Auto, inferring a filter source of %s based on an input " - "message type of %s"), - self._filter_source, - message_type) + self._controller.update_filter_source(message_type=message_type) if self._build_cpp_node(): node = _stages.FilterDetectionsStage(builder, self.unique_name, - self._threshold, + self._controller.threshold, self._copy, - self._filter_source, - self._field_name) + self._controller.filter_source, + self._controller.field_name) else: + if self._copy: node = builder.make_node(self.unique_name, - ops.map(self.filter_copy), + ops.map(self._controller.filter_copy), ops.filter(lambda x: x is not None)) else: # Use `ops.flatten` to convert the list returned by `filter_slice` back to individual messages - node = builder.make_node(self.unique_name, ops.map(self.filter_slice), ops.flatten()) + node = builder.make_node(self.unique_name, ops.map(self._controller.filter_slice), ops.flatten()) builder.make_edge(parent_node, node) diff --git a/morpheus/stages/postprocess/serialize_stage.py b/morpheus/stages/postprocess/serialize_stage.py index 7c421c8bf0..9f72426aa5 100644 --- a/morpheus/stages/postprocess/serialize_stage.py +++ b/morpheus/stages/postprocess/serialize_stage.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import re import typing from functools import partial @@ -23,6 +21,7 @@ import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.controllers.serialize_controller import SerializeController from morpheus.messages import MessageMeta from morpheus.messages import MultiMessage from morpheus.pipeline.single_port_stage import SinglePortStage @@ -62,11 +61,7 @@ def __init__(self, if (exclude is None): exclude = [r'^ID$', r'^_ts_'] - # Make copies of the arrays to prevent changes after the Regex is compiled - self._include_columns = copy.copy(include) - self._exclude_columns = copy.copy(exclude) - self._fixed_columns = fixed_columns - self._columns = None + self._controller = SerializeController(include=include, exclude=exclude, fixed_columns=fixed_columns) @property def name(self) -> str: @@ -88,67 +83,23 @@ def supports_cpp_node(self): # Enable support by default return True - def convert_to_df(self, - x: MultiMessage, - include_columns: typing.Pattern, - exclude_columns: typing.List[typing.Pattern]): - """ - Converts dataframe to entries to JSON lines. - - Parameters - ---------- - x : `morpheus.pipeline.messages.MultiMessage` - MultiMessage instance that contains data. - include_columns : typing.Pattern - Columns that are required send to downstream stage. - exclude_columns : typing.List[typing.Pattern] - Columns that are not required send to downstream stage. 
- - """ - - if self._fixed_columns and self._columns is not None: - columns = self._columns - else: - columns: typing.List[str] = [] - - # Minimize access to x.meta.df - df_columns = list(x.meta.df.columns) - - # First build up list of included. If no include regex is specified, select all - if (include_columns is None): - columns = df_columns - else: - columns = [y for y in df_columns if include_columns.match(y)] - - # Now remove by the ignore - for test in exclude_columns: - columns = [y for y in columns if not test.match(y)] - - self._columns = columns - - # Get metadata from columns - df = x.get_meta(columns) - - return MessageMeta(df=df) - def _build_single(self, builder: mrc.Builder, input_stream: StreamPair) -> StreamPair: if (self._build_cpp_node()): stream = _stages.SerializeStage(builder, self.unique_name, - self._include_columns or [], - self._exclude_columns, - self._fixed_columns) + self._controller.include_columns or [], + self._controller.exclude_columns, + self._controller.fixed_columns) else: - include_columns = None - - if (self._include_columns is not None and len(self._include_columns) > 0): - include_columns = re.compile(f"({'|'.join(self._include_columns)})") - - exclude_columns = [re.compile(x) for x in self._exclude_columns] + include_columns = self._controller.get_include_col_pattern() + exclude_columns = self._controller.get_exclude_col_pattern() stream = builder.make_node( self.unique_name, - ops.map(partial(self.convert_to_df, include_columns=include_columns, exclude_columns=exclude_columns))) + ops.map( + partial(self._controller.convert_to_df, + include_columns=include_columns, + exclude_columns=exclude_columns))) builder.make_edge(input_stream[0], stream) diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index 2ca7078a38..80f7e69694 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -582,6 +582,22 @@ def _process_column(self, df: pd.DataFrame) -> pd.Series: return increment_col.astype(self.get_pandas_dtype()) +@dataclasses.dataclass +class PreparedDFInfo: + """ + Represents the result of preparing a DataFrame along with avilable columns to be preserved. + + Attributes + ---------- + df : typing.Union[pd.DataFrame, cudf.DataFrame] + The prepared DataFrame. + columns_to_preserve : typing.List[str] + A list of column names that are to be preserved. + """ + df: typing.Union[pd.DataFrame, cudf.DataFrame] + columns_to_preserve: typing.List[str] + + def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], input_columns: dict[str, str], json_cols: list[str], @@ -607,9 +623,14 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], The processed DataFrame. 
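Note: the PreparedDFInfo dataclass added above is only a lightweight container pairing the prepared frame with the columns flagged for preservation. A minimal sketch of constructing one directly (the frame contents and column names below are made up for illustration):

import pandas as pd

from morpheus.utils.column_info import PreparedDFInfo

# Illustrative only: bundle a prepared frame with the columns to carry forward.
info = PreparedDFInfo(df=pd.DataFrame({"user": ["alice"], "raw": ["{}"]}),
                      columns_to_preserve=["raw"])

assert info.columns_to_preserve == ["raw"]
assert list(info.df.columns) == ["user", "raw"]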
""" + columns_to_preserve = set() + + if (preserve_re): + columns_to_preserve.update(col for col in df_input.columns if re.match(preserve_re, col)) + # Early exit if (json_cols is None or len(json_cols) == 0): - return df_input + return PreparedDFInfo(df=df_input, columns_to_preserve=list(columns_to_preserve)) # Check if we even have any JSON columns to flatten if (not df_input.columns.intersection(json_cols).empty): @@ -620,9 +641,9 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], df_input = df_input.to_pandas() json_normalized = [] - cols_to_keep = list(df_input.columns) + columns_to_keep = list(df_input.columns) for col in json_cols: - if (col not in cols_to_keep): + if (col not in columns_to_keep): continue pd_series = df_input[col] @@ -639,12 +660,11 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], json_normalized.append(pdf_norm) - # Remove from the list of remaining columns if (preserve_re is None or not preserve_re.match(col)): - cols_to_keep.remove(col) + columns_to_keep.remove(col) # Combine the original DataFrame with the normalized JSON columns - df_input = pd.concat([df_input[cols_to_keep]] + json_normalized, axis=1) + df_input = pd.concat([df_input[columns_to_keep]] + json_normalized, axis=1) if (convert_to_cudf): df_input = cudf.from_pandas(df_input).reset_index(drop=True) @@ -654,7 +674,7 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], df_input = df_input.astype(input_columns) - return df_input + return PreparedDFInfo(df=df_input, columns_to_preserve=list(columns_to_preserve)) def _resolve_json_output_columns(json_cols: list[str], input_cols: dict[str, str]) -> list[tuple[str, str]]: diff --git a/morpheus/utils/monitor_utils.py b/morpheus/utils/monitor_utils.py index e37567d692..586d0730d2 100644 --- a/morpheus/utils/monitor_utils.py +++ b/morpheus/utils/monitor_utils.py @@ -13,21 +13,11 @@ # limitations under the License. import logging -import typing -from functools import reduce -import fsspec from tqdm import TMonitor from tqdm import TqdmSynchronisationWarning from tqdm import tqdm -import cudf - -from morpheus.messages import ControlMessage -from morpheus.messages import MessageMeta -from morpheus.messages import MultiMessage -from morpheus.utils.logger import LogLevels - logger = logging.getLogger(__name__) @@ -144,208 +134,3 @@ class SilentMorpheusTqdm(MorpheusTqdm): def refresh(self, nolock=False, lock_args=None): return - - -class MonitorController: - """ - Controls and displays throughput numbers at a specific point in the pipeline. - - Parameters - ---------- - position: int - Specifies the monitor's position on the console. - description : str, default = "Progress" - Name to show for this Monitor Stage in the console window. - smoothing : float - Smoothing parameter to determine how much the throughput should be averaged. 0 = Instantaneous, 1 = - Average. - unit : str - Units to show in the rate value. - delayed_start : bool - When delayed_start is enabled, the progress bar will not be shown until the first message is received. - Otherwise, the progress bar is shown on pipeline startup and will begin timing immediately. In large pipelines, - this option may be desired to give a more accurate timing. - determine_count_fn : typing.Callable[[typing.Any], int] - Custom function for determining the count in a message. Gets called for each message. Allows for - correct counting of batched and sliced messages. 
- log_level : `morpheus.utils.logger.LogLevels`, default = 'INFO' - Enable this stage when the configured log level is at `log_level` or lower. - tqdm_class: `tqdm`, default = None - Custom implementation of tqdm if required. - """ - - controller_count: int = 0 - - def __init__(self, - position: int, - description: str, - smoothing: float, - unit: str, - delayed_start: bool, - determine_count_fn: typing.Callable[[typing.Any], int], - log_level: LogLevels, - tqdm_class: tqdm = None): - - self._progress: tqdm = None - self._position = position - self._description = description - self._smoothing = smoothing - self._unit = unit - self._delayed_start = delayed_start - self._determine_count_fn = determine_count_fn - self._tqdm_class = tqdm_class if tqdm_class else MorpheusTqdm - - if isinstance(log_level, LogLevels): # pylint: disable=isinstance-second-argument-not-valid-type - log_level = log_level.value - - self._log_level = log_level - self._enabled = None # defined on first call to _is_enabled - - @property - def delayed_start(self) -> bool: - return self._delayed_start - - @property - def progress(self) -> tqdm: - return self._progress - - def is_enabled(self) -> bool: - """ - Returns a boolean indicating whether or not the logger is enabled. - """ - - if self._enabled is None: - self._enabled = logger.isEnabledFor(self._log_level) - - return self._enabled - - def ensure_progress_bar(self): - """ - Ensures that the progress bar is initialized and ready for display. - """ - - if (self._progress is None): - self._progress = self._tqdm_class(desc=self._description, - smoothing=self._smoothing, - dynamic_ncols=True, - unit=(self._unit if self._unit.startswith(" ") else f" {self._unit}"), - mininterval=0.25, - maxinterval=1.0, - miniters=1, - position=self._position) - - self._progress.reset() - - def refresh_progress(self, _): - """ - Refreshes the progress bar display. - """ - self._progress.refresh() - - def progress_sink(self, x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List]): - """ - Receives a message and determines the count of the message. - The progress bar is displayed and the progress is updated. - - Parameters - ---------- - x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List] - Message that determines the count of the message - - Returns - ------- - x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List] - - """ - - # Make sure the progress bar is shown - self.ensure_progress_bar() - - if (self._determine_count_fn is None): - self._determine_count_fn = self.auto_count_fn(x) - - # Skip incase we have empty objects - if (self._determine_count_fn is None): - return x - - # Do our best to determine the count - count = self._determine_count_fn(x) - - self._progress.update(n=count) - - return x - - def auto_count_fn(self, x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List]): - """ - This is a helper function that is used to determine the count of messages received by the - monitor. - - Parameters - ---------- - x: typing.Union[cudf.DataFrame, MultiMessage, MessageMeta, ControlMessage, typing.List] - Message that determines the count of the message - - Returns - ------- - Message count. 
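Note: the MonitorController removed here now lives in morpheus/controllers/monitor_controller.py (see the import changes earlier in this patch). A minimal usage sketch, assuming the relocated class keeps the constructor signature shown in the removed code:

from morpheus.controllers.monitor_controller import MonitorController
from morpheus.utils.logger import LogLevels

# Assumption: same arguments as the copy removed from monitor_utils.py.
controller = MonitorController(position=0,
                               description="Progress",
                               smoothing=0.05,
                               unit="messages",
                               delayed_start=False,
                               determine_count_fn=None,
                               log_level=LogLevels.INFO)

if controller.is_enabled():
    # With no determine_count_fn, the count is inferred per message type;
    # a two-element list updates the progress bar by 2.
    controller.progress_sink(["one", "two"])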
- - """ - - # pylint: disable=too-many-return-statements - - if (x is None): - return None - - # Wait for a list thats not empty - if (isinstance(x, list) and len(x) == 0): - return None - - if (isinstance(x, cudf.DataFrame)): - return lambda y: len(y.index) - - if (isinstance(x, MultiMessage)): - return lambda y: y.mess_count - - if (isinstance(x, MessageMeta)): - return lambda y: y.count - - if isinstance(x, ControlMessage): - - def check_df(y): - df = y.payload().df - if df is not None: - return len(df) - - return 0 - - return check_df - - if (isinstance(x, list)): - item_count_fn = self.auto_count_fn(x[0]) - return lambda y: reduce(lambda sum, z, item_count_fn=item_count_fn: sum + item_count_fn(z), y, 0) - - if (isinstance(x, (str, fsspec.core.OpenFile))): - return lambda y: 1 - - if (hasattr(x, "__len__")): - return len # Return len directly (same as `lambda y: len(y)`) - - raise NotImplementedError(f"Unsupported type: {type(x)}") - - def sink_on_completed(self): - """ - Stops the progress bar and prevents the monitors from writing over each other when the last - stage completes. - """ - - # Set the name to complete. This refreshes the display - self.progress.set_description_str(self.progress.desc + "[Complete]") - - self.progress.stop() - - # To prevent the monitors from writing over eachother, stop the monitor when the last stage completes - MonitorController.controller_count -= 1 - - if (MonitorController.controller_count <= 0 and self._tqdm_class.monitor is not None): - self._tqdm_class.monitor.exit() - self._tqdm_class.monitor = None diff --git a/morpheus/utils/schema_transforms.py b/morpheus/utils/schema_transforms.py index 37fa539fb1..8abbccf9c3 100644 --- a/morpheus/utils/schema_transforms.py +++ b/morpheus/utils/schema_transforms.py @@ -22,6 +22,7 @@ import cudf from morpheus.utils.column_info import DataFrameInputSchema +from morpheus.utils.column_info import PreparedDFInfo from morpheus.utils.nvt import patches from morpheus.utils.nvt.extensions import morpheus_ext from morpheus.utils.nvt.schema_converters import create_and_attach_nvt_workflow @@ -101,10 +102,18 @@ def process_dataframe( # Note(Devin): pre-flatten to avoid Dask hang when calling json_normalize within an NVT operator if (input_schema.prep_dataframe is not None): - df_in = input_schema.prep_dataframe(df_in) + prepared_df_info: PreparedDFInfo = input_schema.prep_dataframe(df_in) nvt_workflow = input_schema.nvt_workflow + preserve_df = None + + if prepared_df_info is not None: + df_in = prepared_df_info.df + + if prepared_df_info.columns_to_preserve: + preserve_df = df_in[prepared_df_info.columns_to_preserve] + if (convert_to_pd): df_in = cudf.DataFrame(df_in) @@ -127,6 +136,17 @@ def process_dataframe( df_result.set_index(saved_index.take(df_result.index), inplace=True) if (convert_to_pd): - return df_result.to_pandas() + df_result = df_result.to_pandas() + + # Restore preserved columns + if (preserve_df is not None): + # Ensure there is no overlap with columns to preserve + columns_to_merge = set(preserve_df.columns) - set(df_result.columns) + columns_to_merge = list(columns_to_merge) + if (columns_to_merge): + if (convert_to_pd): + df_result = pd.concat([df_result, preserve_df[columns_to_merge]], axis=1) + else: + df_result = cudf.concat([df_result, preserve_df[columns_to_merge]], axis=1) return df_result diff --git a/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py b/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py index 7502aaaa78..be0e7c0848 100644 --- 
a/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py +++ b/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py @@ -26,6 +26,7 @@ from _utils.dataset_manager import DatasetManager from morpheus.common import FileTypes from morpheus.config import Config +from morpheus.controllers.file_to_df_controller import single_object_to_dataframe from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.column_info import CustomColumn @@ -47,12 +48,11 @@ def single_file_obj(): # pylint: disable=redefined-outer-name def test_single_object_to_dataframe(single_file_obj: fsspec.core.OpenFile): - from dfp.stages.dfp_file_to_df import _single_object_to_dataframe fake_lambda = mock.MagicMock() schema = DataFrameInputSchema(column_info=[CustomColumn(name='data', dtype=str, process_column_fn=fake_lambda)]) - df = _single_object_to_dataframe(single_file_obj, schema, FileTypes.Auto, False, {}) + df = single_object_to_dataframe(single_file_obj, schema, FileTypes.Auto, False, {}) fake_lambda.assert_not_called() assert sorted(df.columns) == sorted(['plugin', 'titles', 'data', 'count']) @@ -67,12 +67,11 @@ def test_single_object_to_dataframe(single_file_obj: fsspec.core.OpenFile): def test_single_object_to_dataframe_timeout(): - from dfp.stages.dfp_file_to_df import _single_object_to_dataframe input_glob = os.path.join(TEST_DIRS.tests_data_dir, 'appshield', 'snapshot-1', 'fake_wont_match*.json') bad_file = fsspec.core.OpenFile(fs=fsspec.open_files(input_glob).fs, path='/tmp/fake/doesnt/exit.csv') - assert _single_object_to_dataframe(bad_file, DataFrameInputSchema(), FileTypes.CSV, False, {}) is None + assert single_object_to_dataframe(bad_file, DataFrameInputSchema(), FileTypes.CSV, False, {}) is None @pytest.mark.usefixtures("restore_environ") @@ -92,11 +91,11 @@ def test_constructor(config: Config): assert isinstance(stage, SinglePortStage) assert isinstance(stage, PreallocatorMixin) - assert stage._schema is schema - assert stage._file_type == FileTypes.PARQUET - assert not stage._filter_null - assert stage._parser_kwargs == {'test': 'this'} - assert stage._cache_dir.startswith('/test/path/cache') + assert stage._controller._schema is schema + assert stage._controller._file_type == FileTypes.PARQUET + assert not stage._controller._filter_null + assert stage._controller._parser_kwargs == {'test': 'this'} + assert stage._controller._cache_dir.startswith('/test/path/cache') # pylint: disable=redefined-outer-name @@ -106,9 +105,9 @@ def test_constructor(config: Config): @mock.patch('multiprocessing.get_context') @mock.patch('dask.distributed.Client') @mock.patch('dask_cuda.LocalCUDACluster') -@mock.patch('dfp.stages.dfp_file_to_df._single_object_to_dataframe') +@mock.patch('morpheus.controllers.file_to_df_controller.single_object_to_dataframe') @mock.patch('morpheus.utils.downloader.Distributed') -@mock.patch('dfp.stages.dfp_file_to_df.process_dataframe') +@mock.patch('morpheus.controllers.file_to_df_controller.process_dataframe') def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicMock, mock_distributed: mock.MagicMock, mock_obf_to_df: mock.MagicMock, @@ -172,9 +171,9 @@ def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicM if use_convert_to_dataframe: # convert_to_dataframe is a thin wrapper around _get_or_create_dataframe_from_batch, no need to create # a new test for it - output_df = stage.convert_to_dataframe((batch, 1)) + output_df = 
stage._controller.convert_to_dataframe((batch, 1)) else: - (output_df, cache_hit) = stage._get_or_create_dataframe_from_batch((batch, 1)) + (output_df, cache_hit) = stage._controller._get_or_create_dataframe_from_batch((batch, 1)) assert not cache_hit if dl_type in ("multiprocess", "multiprocessing"): @@ -200,7 +199,7 @@ def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicM dataset_pandas.assert_df_equal(output_df, expected_df) - expected_cache_file_path = os.path.join(stage._cache_dir, "batches", f"{expected_hash}.pkl") + expected_cache_file_path = os.path.join(stage._controller._cache_dir, "batches", f"{expected_hash}.pkl") assert os.path.exists(expected_cache_file_path) dataset_pandas.assert_df_equal(pd.read_pickle(expected_cache_file_path), expected_df[dataset_pandas['filter_probs.csv'].columns]) @@ -213,7 +212,7 @@ def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicM @mock.patch('dask.config') @mock.patch('dask.distributed.Client') @mock.patch('dask_cuda.LocalCUDACluster') -@mock.patch('dfp.stages.dfp_file_to_df._single_object_to_dataframe') +@mock.patch('morpheus.controllers.file_to_df_controller.single_object_to_dataframe') def test_get_or_create_dataframe_from_batch_cache_hit(mock_obf_to_df: mock.MagicMock, mock_dask_cluster: mock.MagicMock, mock_dask_client: mock.MagicMock, @@ -260,9 +259,9 @@ def test_get_or_create_dataframe_from_batch_cache_hit(mock_obf_to_df: mock.Magic if use_convert_to_dataframe: # convert_to_dataframe is a thin wrapper around _get_or_create_dataframe_from_batch, no need to create # a new test for it - output_df = stage.convert_to_dataframe((batch, 1)) + output_df = stage._controller.convert_to_dataframe((batch, 1)) else: - (output_df, cache_hit) = stage._get_or_create_dataframe_from_batch((batch, 1)) + (output_df, cache_hit) = stage._controller._get_or_create_dataframe_from_batch((batch, 1)) assert cache_hit # When we get a cache hit, none of the download methods should be executed @@ -283,7 +282,7 @@ def test_get_or_create_dataframe_from_batch_cache_hit(mock_obf_to_df: mock.Magic @mock.patch('dask.config') @mock.patch('dask.distributed.Client') @mock.patch('dask_cuda.LocalCUDACluster') -@mock.patch('dfp.stages.dfp_file_to_df._single_object_to_dataframe') +@mock.patch('morpheus.controllers.file_to_df_controller.single_object_to_dataframe') def test_get_or_create_dataframe_from_batch_none_noop(mock_obf_to_df: mock.MagicMock, mock_dask_cluster: mock.MagicMock, mock_dask_client: mock.MagicMock, @@ -304,10 +303,10 @@ def test_get_or_create_dataframe_from_batch_none_noop(mock_obf_to_df: mock.Magic os.environ['MORPHEUS_FILE_DOWNLOAD_TYPE'] = dl_type stage = DFPFileToDataFrameStage(config, DataFrameInputSchema(), cache_dir=tmp_path) if use_convert_to_dataframe: - assert stage.convert_to_dataframe(None) is None + assert stage._controller.convert_to_dataframe(None) is None else: with pytest.raises(RuntimeError, match="No file objects to process"): - stage._get_or_create_dataframe_from_batch(None) + stage._controller._get_or_create_dataframe_from_batch(None) mock_obf_to_df.assert_not_called() mock_dask_cluster.assert_not_called() diff --git a/tests/examples/digital_fingerprinting/test_dfp_mlflow_model_writer.py b/tests/examples/digital_fingerprinting/test_dfp_mlflow_model_writer.py index caf0dd532a..54b438d4a3 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_mlflow_model_writer.py +++ b/tests/examples/digital_fingerprinting/test_dfp_mlflow_model_writer.py @@ -61,11 +61,12 @@ def mock_requests_fixture(): 
yield MockedRequests(mock_requests_get, mock_requests_patch, mock_response) -@pytest.fixture(name="mock_mlflow") -def mock_mlflow_fixture(): - with (mock.patch("dfp.stages.dfp_mlflow_model_writer.MlflowClient") as mock_mlflow_client, - mock.patch("dfp.stages.dfp_mlflow_model_writer.ModelSignature") as mock_model_signature, - mock.patch("dfp.stages.dfp_mlflow_model_writer.RunsArtifactRepository") as mock_runs_artifact_repository, +@pytest.fixture +def mock_mlflow(): + with (mock.patch("morpheus.controllers.mlflow_model_writer_controller.MlflowClient") as mock_mlflow_client, + mock.patch("morpheus.controllers.mlflow_model_writer_controller.ModelSignature") as mock_model_signature, + mock.patch("morpheus.controllers.mlflow_model_writer_controller.RunsArtifactRepository") as + mock_runs_artifact_repository, mock.patch("mlflow.end_run") as mock_mlflow_end_run, mock.patch("mlflow.get_tracking_uri") as mock_mlflow_get_tracking_uri, mock.patch("mlflow.log_metrics") as mock_mlflow_log_metrics, @@ -114,9 +115,9 @@ def test_constructor(config: Config): experiment_name_formatter="/test/{user_id}-{user_md5}-{reg_model_name}", databricks_permissions={'test': 'this'}) assert isinstance(stage, SinglePortStage) - assert stage._model_name_formatter == "test_model_name-{user_id}-{user_md5}" - assert stage._experiment_name_formatter == "/test/{user_id}-{user_md5}-{reg_model_name}" - assert stage._databricks_permissions == {'test': 'this'} + assert stage._controller.model_name_formatter == "test_model_name-{user_id}-{user_md5}" + assert stage._controller.experiment_name_formatter == "/test/{user_id}-{user_md5}-{reg_model_name}" + assert stage._controller.databricks_permissions == {'test': 'this'} @pytest.mark.parametrize( @@ -131,7 +132,7 @@ def test_user_id_to_model(config: Config, model_name_formatter: str, user_id: st from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage stage = DFPMLFlowModelWriterStage(config, model_name_formatter=model_name_formatter) - assert stage.user_id_to_model(user_id) == expected_val + assert stage._controller.user_id_to_model(user_id) == expected_val @pytest.mark.parametrize("experiment_name_formatter,user_id,expected_val", @@ -151,7 +152,7 @@ def test_user_id_to_experiment(config: Config, experiment_name_formatter: str, u stage = DFPMLFlowModelWriterStage(config, model_name_formatter="dfp-{user_id}", experiment_name_formatter=experiment_name_formatter) - assert stage.user_id_to_experiment(user_id) == expected_val + assert stage._controller.user_id_to_experiment(user_id) == expected_val def verify_apply_model_permissions(mock_requests: MockedRequests, @@ -177,8 +178,8 @@ def verify_apply_model_permissions(mock_requests: MockedRequests, def test_apply_model_permissions(config: Config, databricks_env: dict, mock_requests: MockedRequests): from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage databricks_permissions = OrderedDict([('group1', 'CAN_READ'), ('group2', 'CAN_WRITE')]) - stage = DFPMLFlowModelWriterStage(config, databricks_permissions=databricks_permissions) - stage._apply_model_permissions("test_experiment") + stage = DFPMLFlowModelWriterStage(config, databricks_permissions=databricks_permissions, timeout=10) + stage._controller._apply_model_permissions("test_experiment") verify_apply_model_permissions(mock_requests, databricks_env, databricks_permissions, 'test_experiment') @@ -206,7 +207,7 @@ def test_apply_model_permissions_no_perms_error(config: Config, from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage stage 
= DFPMLFlowModelWriterStage(config) with pytest.raises(RuntimeError): - stage._apply_model_permissions("test_experiment") + stage._controller._apply_model_permissions("test_experiment") mock_requests.get.assert_not_called() mock_requests.patch.assert_not_called() @@ -217,8 +218,8 @@ def test_apply_model_permissions_requests_error(config: Config, mock_requests: M from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage mock_requests.get.side_effect = RuntimeError("test error") - stage = DFPMLFlowModelWriterStage(config) - stage._apply_model_permissions("test_experiment") + stage = DFPMLFlowModelWriterStage(config, timeout=10) + stage._controller._apply_model_permissions("test_experiment") # This method just catches and logs any errors mock_requests.get.assert_called_once() @@ -227,13 +228,14 @@ def test_apply_model_permissions_requests_error(config: Config, mock_requests: M @pytest.mark.parametrize("databricks_permissions", [None, {}]) @pytest.mark.parametrize("tracking_uri", ['file:///home/user/morpheus/mlruns', "databricks"]) -def test_on_data(config: Config, - mock_mlflow: MockedMLFlow, - mock_requests: MockedRequests, - dataset_pandas: DatasetManager, - databricks_env: dict, - databricks_permissions: dict, - tracking_uri: str): +def test_on_data( + config: Config, + mock_mlflow: MockedMLFlow, # pylint: disable=redefined-outer-name + mock_requests: MockedRequests, + dataset_pandas: DatasetManager, + databricks_env: dict, + databricks_permissions: dict, + tracking_uri: str): from dfp.messages.multi_dfp_message import DFPMessageMeta from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage from dfp.stages.dfp_mlflow_model_writer import conda_env @@ -271,8 +273,8 @@ def test_on_data(config: Config, meta = DFPMessageMeta(df, 'Account-123456789') msg = MultiAEMessage(meta=meta, model=mock_model) - stage = DFPMLFlowModelWriterStage(config, databricks_permissions=databricks_permissions) - assert stage.on_data(msg) is msg # Should be a pass-thru + stage = DFPMLFlowModelWriterStage(config, databricks_permissions=databricks_permissions, timeout=10) + assert stage._controller.on_data(msg) is msg # Should be a pass-thru # Test mocks in order that they're called mock_mlflow.end_run.assert_called_once() diff --git a/tests/test_cli.py b/tests/test_cli.py index a2379fe074..23408165ee 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -244,7 +244,7 @@ def test_pipeline_ae(self, config, callback_values): assert isinstance(serialize, SerializeStage) assert isinstance(to_file, WriteToFileStage) - assert to_file._output_file == 'out.csv' + assert to_file._controller._output_file == 'out.csv' @pytest.mark.replace_callback('pipeline_ae') def test_pipeline_ae_all(self, callback_values): @@ -338,7 +338,7 @@ def test_pipeline_ae_all(self, callback_values): assert isinstance(serialize, SerializeStage) assert isinstance(to_file, WriteToFileStage) - assert to_file._output_file == 'out.csv' + assert to_file._controller._output_file == 'out.csv' assert isinstance(to_kafka, WriteToKafkaStage) assert to_kafka._kafka_conf['bootstrap.servers'] == 'kserv1:123,kserv2:321' @@ -404,7 +404,7 @@ def test_pipeline_fil(self, config, callback_values): assert isinstance(serialize, SerializeStage) assert isinstance(to_file, WriteToFileStage) - assert to_file._output_file == 'out.csv' + assert to_file._controller._output_file == 'out.csv' @pytest.mark.replace_callback('pipeline_fil') def test_pipeline_fil_all(self, config, callback_values, tmp_path, mlflow_uri): @@ -528,7 +528,7 @@ def 
test_pipeline_fil_all(self, config, callback_values, tmp_path, mlflow_uri): assert isinstance(serialize, SerializeStage) assert isinstance(to_file, WriteToFileStage) - assert to_file._output_file == 'out.csv' + assert to_file._controller._output_file == 'out.csv' assert isinstance(to_kafka, WriteToKafkaStage) assert to_kafka._kafka_conf['bootstrap.servers'] == 'kserv1:123,kserv2:321' @@ -624,7 +624,7 @@ def test_enum_parsing(self, config, callback_values, tmp_path, mlflow_uri): assert isinstance(deserialize, DeserializeStage) assert isinstance(filter_stage, FilterDetectionsStage) - assert filter_stage._filter_source == FilterSource.TENSOR + assert filter_stage._controller._filter_source == FilterSource.TENSOR assert isinstance(dropna, DropNullStage) assert dropna._column == 'xyz' @@ -662,8 +662,8 @@ def test_enum_parsing(self, config, callback_values, tmp_path, mlflow_uri): assert isinstance(serialize, SerializeStage) assert isinstance(to_file, WriteToFileStage) - assert to_file._output_file == 'out.csv' - assert to_file._file_type == FileTypes.CSV + assert to_file._controller._output_file == 'out.csv' + assert to_file._controller._file_type == FileTypes.CSV assert isinstance(to_kafka, WriteToKafkaStage) assert to_kafka._kafka_conf['bootstrap.servers'] == 'kserv1:123,kserv2:321' @@ -745,7 +745,7 @@ def test_pipeline_nlp(self, config, callback_values): assert isinstance(serialize, SerializeStage) assert isinstance(to_file, WriteToFileStage) - assert to_file._output_file == 'out.csv' + assert to_file._controller._output_file == 'out.csv' @pytest.mark.replace_callback('pipeline_nlp') def test_pipeline_nlp_all(self, config, callback_values, tmp_path, mlflow_uri): @@ -877,7 +877,7 @@ def test_pipeline_nlp_all(self, config, callback_values, tmp_path, mlflow_uri): assert isinstance(serialize, SerializeStage) assert isinstance(to_file, WriteToFileStage) - assert to_file._output_file == 'out.csv' + assert to_file._controller._output_file == 'out.csv' assert isinstance(to_kafka, WriteToKafkaStage) assert to_kafka._kafka_conf['bootstrap.servers'] == 'kserv1:123,kserv2:321' diff --git a/tests/test_filter_detections_stage.py b/tests/test_filter_detections_stage.py index 9eeead93e2..e147d17d34 100755 --- a/tests/test_filter_detections_stage.py +++ b/tests/test_filter_detections_stage.py @@ -40,7 +40,7 @@ def test_constructor(config): assert len(accepted_types) > 0 fds = FilterDetectionsStage(config, threshold=0.2) - assert fds._threshold == 0.2 + assert fds._controller._threshold == 0.2 @pytest.mark.use_cudf @@ -52,7 +52,7 @@ def test_filter_copy(config, filter_probs_df): mock_message = _make_message(filter_probs_df, probs) # All values are at or below the threshold so nothing should be returned - output_message = fds.filter_copy(mock_message) + output_message = fds._controller.filter_copy(mock_message) assert output_message is None # Only one row has a value above the threshold @@ -64,7 +64,7 @@ def test_filter_copy(config, filter_probs_df): mock_message = _make_message(filter_probs_df, probs) - output_message = fds.filter_copy(mock_message) + output_message = fds._controller.filter_copy(mock_message) assert output_message.get_meta().to_cupy().tolist() == filter_probs_df.loc[1:1, :].to_cupy().tolist() # Two adjacent rows have a value above the threashold @@ -78,7 +78,7 @@ def test_filter_copy(config, filter_probs_df): mock_message = _make_message(filter_probs_df, probs) - output_message = fds.filter_copy(mock_message) + output_message = fds._controller.filter_copy(mock_message) assert 
output_message.get_meta().to_cupy().tolist() == filter_probs_df.loc[2:3, :].to_cupy().tolist() # Two non-adjacent rows have a value above the threashold @@ -93,7 +93,7 @@ def test_filter_copy(config, filter_probs_df): mock_message = _make_message(filter_probs_df, probs) - output_message = fds.filter_copy(mock_message) + output_message = fds._controller.filter_copy(mock_message) mask = cp.zeros(len(filter_probs_df), dtype=cp.bool_) mask[2] = True mask[4] = True @@ -118,7 +118,7 @@ def test_filter_column(config, filter_probs_df, do_copy, threshold, field_name): mock_message = _make_message(filter_probs_df, probs) # All values are at or below the threshold - output_message = fds.filter_copy(mock_message) + output_message = fds._controller.filter_copy(mock_message) assert output_message.get_meta().to_cupy().tolist() == expected_df.to_numpy().tolist() @@ -132,7 +132,7 @@ def test_filter_slice(config, filter_probs_df): mock_message = _make_message(filter_probs_df, probs) # All values are at or below the threshold - output_messages = fds.filter_slice(mock_message) + output_messages = fds._controller.filter_slice(mock_message) assert len(output_messages) == 0 # Only one row has a value above the threshold @@ -144,7 +144,7 @@ def test_filter_slice(config, filter_probs_df): mock_message = _make_message(filter_probs_df, probs) - output_messages = fds.filter_slice(mock_message) + output_messages = fds._controller.filter_slice(mock_message) assert len(output_messages) == 1 output_message = output_messages[0] assert output_message.get_meta().to_cupy().tolist() == filter_probs_df.loc[1:1, :].to_cupy().tolist() @@ -160,7 +160,7 @@ def test_filter_slice(config, filter_probs_df): mock_message = _make_message(filter_probs_df, probs) - output_messages = fds.filter_slice(mock_message) + output_messages = fds._controller.filter_slice(mock_message) assert len(output_messages) == 1 output_message = output_messages[0] assert output_message.offset == 2 @@ -179,7 +179,7 @@ def test_filter_slice(config, filter_probs_df): mock_message = _make_message(filter_probs_df, probs) - output_messages = fds.filter_slice(mock_message) + output_messages = fds._controller.filter_slice(mock_message) assert len(output_messages) == 2 (msg1, msg2) = output_messages # pylint: disable=unbalanced-tuple-unpacking assert msg1.offset == 2 diff --git a/tests/test_monitor_stage.py b/tests/test_monitor_stage.py index 586bb04e75..1e6e045459 100755 --- a/tests/test_monitor_stage.py +++ b/tests/test_monitor_stage.py @@ -59,7 +59,7 @@ def two_x(x): assert stage._mc._determine_count_fn is two_x -@mock.patch('morpheus.utils.monitor_utils.MorpheusTqdm') +@mock.patch('morpheus.controllers.monitor_controller.MorpheusTqdm') def test_on_start(mock_morph_tqdm, config): mock_morph_tqdm.return_value = mock_morph_tqdm @@ -72,7 +72,7 @@ def test_on_start(mock_morph_tqdm, config): assert stage._mc._progress is mock_morph_tqdm -@mock.patch('morpheus.utils.monitor_utils.MorpheusTqdm') +@mock.patch('morpheus.controllers.monitor_controller.MorpheusTqdm') def test_stop(mock_morph_tqdm, config): mock_morph_tqdm.return_value = mock_morph_tqdm @@ -88,7 +88,7 @@ def test_stop(mock_morph_tqdm, config): mock_morph_tqdm.close.assert_called_once() -@mock.patch('morpheus.utils.monitor_utils.MorpheusTqdm') +@mock.patch('morpheus.controllers.monitor_controller.MorpheusTqdm') def test_refresh(mock_morph_tqdm, config): mock_morph_tqdm.return_value = mock_morph_tqdm @@ -134,7 +134,7 @@ def test_auto_count_fn_not_impl(config, value: typing.Any): stage._mc.auto_count_fn(value) 
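Note: the monitor tests now patch MorpheusTqdm at the module the controller imports it from, rather than at morpheus.utils.monitor_utils. An illustrative, hypothetical test skeleton showing that pattern:

from unittest import mock

# Hypothetical test name; mirrors the patching pattern used in test_monitor_stage.py.
@mock.patch('morpheus.controllers.monitor_controller.MorpheusTqdm')
def test_monitor_uses_patched_tqdm(mock_morph_tqdm):
    mock_morph_tqdm.return_value = mock_morph_tqdm
    # Nothing is constructed until a stage starts, so the mock is untouched here.
    assert mock_morph_tqdm.call_count == 0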
-@mock.patch('morpheus.utils.monitor_utils.MorpheusTqdm') +@mock.patch('morpheus.controllers.monitor_controller.MorpheusTqdm') def test_progress_sink(mock_morph_tqdm, config): mock_morph_tqdm.return_value = mock_morph_tqdm diff --git a/tests/test_serialize_stage.py b/tests/test_serialize_stage.py index 0f596b5980..9030a19e90 100755 --- a/tests/test_serialize_stage.py +++ b/tests/test_serialize_stage.py @@ -42,16 +42,16 @@ def test_fixed_columns(config): include_re_str = '^app.*' include_re = re.compile(include_re_str) - s = SerializeStage(config, include=[include_re_str], fixed_columns=True) - meta1 = s.convert_to_df(mm1, include_columns=include_re, exclude_columns=[]) - meta2 = s.convert_to_df(mm2, include_columns=include_re, exclude_columns=[]) + stage = SerializeStage(config, include=[include_re_str], fixed_columns=True) + meta1 = stage._controller.convert_to_df(mm1, include_columns=include_re, exclude_columns=[]) + meta2 = stage._controller.convert_to_df(mm2, include_columns=include_re, exclude_columns=[]) assert meta1.df.columns.to_list() == ['apples', 'apple_sauce'] assert meta2.df.columns.to_list() == ['apples', 'apple_sauce'] - s = SerializeStage(config, include=[include_re_str], fixed_columns=False) - meta1 = s.convert_to_df(mm1, include_columns=include_re, exclude_columns=[]) - meta2 = s.convert_to_df(mm2, include_columns=include_re, exclude_columns=[]) + stage = SerializeStage(config, include=[include_re_str], fixed_columns=False) + meta1 = stage._controller.convert_to_df(mm1, include_columns=include_re, exclude_columns=[]) + meta2 = stage._controller.convert_to_df(mm2, include_columns=include_re, exclude_columns=[]) assert meta1.df.columns.to_list() == ['apples', 'apple_sauce'] assert meta2.df.columns.to_list() == ['apples', 'applause', 'apple_sauce'] diff --git a/tests/utils/nvt/test_schema_converters.py b/tests/utils/nvt/test_schema_converters.py index 917f5cf90a..03270a6da5 100644 --- a/tests/utils/nvt/test_schema_converters.py +++ b/tests/utils/nvt/test_schema_converters.py @@ -26,6 +26,7 @@ from morpheus.utils.column_info import DateTimeColumn from morpheus.utils.column_info import DistinctIncrementColumn from morpheus.utils.column_info import IncrementColumn +from morpheus.utils.column_info import PreparedDFInfo from morpheus.utils.column_info import RenameColumn from morpheus.utils.column_info import StringCatColumn from morpheus.utils.column_info import StringJoinColumn @@ -361,8 +362,8 @@ def test_input_schema_conversion_interdependent_columns(): test_df["application"] = ['{"name": "AnotherApp", "version": "1.0"}'] modified_schema = create_and_attach_nvt_workflow(modified_schema) - test_df = modified_schema.prep_dataframe(test_df) - dataset = nvt.Dataset(test_df) + prepared_df_info: PreparedDFInfo = modified_schema.prep_dataframe(test_df) + dataset = nvt.Dataset(prepared_df_info.df) output_df = modified_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() expected_df = pd.DataFrame({ @@ -399,8 +400,8 @@ def test_input_schema_conversion_nested_operations(): modified_schema.column_info.append(ColumnInfo(name="appsuffix", dtype="str")) modified_schema = create_and_attach_nvt_workflow(modified_schema) - test_df = modified_schema.prep_dataframe(test_df) - dataset = nvt.Dataset(test_df) + prepared_df_info: PreparedDFInfo = modified_schema.prep_dataframe(test_df) + dataset = nvt.Dataset(prepared_df_info.df) output_df = modified_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() expected_df = pd.DataFrame({ @@ -503,8 +504,8 @@ def 
test_input_schema_conversion(): modified_schema = create_and_attach_nvt_workflow(example_schema) # Apply the returned nvt.Workflow to the test dataframe - test_df = modified_schema.prep_dataframe(test_df) - dataset = nvt.Dataset(test_df) + prepared_df_info: PreparedDFInfo = modified_schema.prep_dataframe(test_df) + dataset = nvt.Dataset(prepared_df_info.df) output_df = modified_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() # Check if the output dataframe has the expected schema and values @@ -587,8 +588,8 @@ def test_input_schema_conversion_with_functional_filter(): example_schema = create_and_attach_nvt_workflow(example_schema) # Apply the returned nvt.Workflow to the test dataframe - test_df = example_schema.prep_dataframe(test_df) - dataset = nvt.Dataset(test_df) + prepared_df_info: PreparedDFInfo = example_schema.prep_dataframe(test_df) + dataset = nvt.Dataset(prepared_df_info.df) output_df = example_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() # Check if the output dataframe has the expected schema and values
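Note: the tests above cover the prep_dataframe side of the preserved-columns fix; the matching restore step added to process_dataframe() can be illustrated with a standalone pandas sketch (frame contents are made up):

import pandas as pd

# Preserved columns are concatenated back only when they are not already
# present in the transformed result.
df_result = pd.DataFrame({"user": ["alice"], "score": [0.9]})
preserve_df = pd.DataFrame({"raw_event": ["{}"], "user": ["alice"]})

columns_to_merge = list(set(preserve_df.columns) - set(df_result.columns))
if columns_to_merge:
    df_result = pd.concat([df_result, preserve_df[columns_to_merge]], axis=1)

assert sorted(df_result.columns) == ["raw_event", "score", "user"]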