From 11697ded2cd0225723d505d84a31d756c16d988c Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Mon, 25 Sep 2023 12:21:56 -0400 Subject: [PATCH] Phishing example fix (#1215) - Update labels file to reflect actual model outputs: `not_phishing` and `is_phishing` - Add `AddScores` stage to Python and CLI examples - Remove `FilterDetectionsStage` because model output contains probabilities for both `not_phishing` and `is_phishing`one of which will always exceed threshold. This results in nothing ever being filtered out. Closes #1208 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - Eli Fajardo (https://github.com/efajardo-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1215 --- .../guides/2_real_world_phishing.md | 20 +++++++++++-------- models/data/labels_phishing.txt | 4 ++-- tests/test_cli.py | 16 +++++++-------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/docs/source/developer_guide/guides/2_real_world_phishing.md b/docs/source/developer_guide/guides/2_real_world_phishing.md index 0e5ed2f5ef..71c132d608 100644 --- a/docs/source/developer_guide/guides/2_real_world_phishing.md +++ b/docs/source/developer_guide/guides/2_real_world_phishing.md @@ -406,7 +406,8 @@ Note that the tokenizer parameters and vocabulary hash file should exactly match At this point, we have a pipeline that reads in a set of records and pre-processes them with the metadata required for our classifier to make predictions. Our next step is to define a stage that applies a machine learning model to our `MessageMeta` object. To accomplish this, we will be using Morpheus' `TritonInferenceStage`. This stage will handle communication with the `phishing-bert-onnx` model, which we provided to the Triton Docker container via the `models` directory mount. -Next we will add a monitor stage to measure the inference rate as well as a filter stage to filter out any results below a probability threshold of `0.9`. +Next we will add a monitor stage to measure the inference rate: + ```python # Add an inference stage pipeline.add_stage( @@ -418,14 +419,17 @@ pipeline.add_stage( )) pipeline.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) +``` -# Filter values lower than 0.9 -pipeline.add_stage(FilterDetectionsStage(config, threshold=0.9)) +Here we add a postprocessing stage that adds the probability score for `is_phishing`: + +```python +pipeline.add_stage(AddScoresStage(config, labels=["is_phishing"])) ``` Lastly, we will save our results to disk. For this purpose, we are using two stages that are often used in conjunction with each other: `SerializeStage` and `WriteToFileStage`. -The `SerializeStage` is used to include and exclude columns as desired in the output. Importantly, it also handles conversion from the `MultiMessage`-derived output type that is used by the `FilterDetectionsStage` to the `MessageMeta` class that is expected as input by the `WriteToFileStage`. +The `SerializeStage` is used to include and exclude columns as desired in the output. Importantly, it also handles conversion from the `MultiMessage`-derived output type to the `MessageMeta` class that is expected as input by the `WriteToFileStage`. The `WriteToFileStage` will append message data to the output file as messages are received. Note however that for performance reasons the `WriteToFileStage` does not flush its contents out to disk every time a message is received. Instead, it relies on the underlying [buffered output stream](https://gcc.gnu.org/onlinedocs/libstdc++/manual/streambufs.html) to flush as needed, and then will close the file handle on shutdown. @@ -456,7 +460,7 @@ from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.inference.triton_inference_stage import TritonInferenceStage from morpheus.stages.input.file_source_stage import FileSourceStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage -from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage +from morpheus.stages.postprocess.add_scores_stage import AddScoresStage from morpheus.stages.postprocess.serialize_stage import SerializeStage from morpheus.stages.preprocess.deserialize_stage import DeserializeStage from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage @@ -522,8 +526,8 @@ def run_pipeline(): # Monitor the inference rate pipeline.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) - # Filter values lower than 0.9 - pipeline.add_stage(FilterDetectionsStage(config, threshold=0.9)) + # Add probability score for is_phishing + pipeline.add_stage(AddScoresStage(config, labels=["is_phishing"])) # Write the to the output file pipeline.add_stage(SerializeStage(config)) @@ -550,7 +554,7 @@ morpheus --log_level=debug --plugin examples/developer_guide/2_1_real_world_phis preprocess --vocab_hash_file=data/bert-base-uncased-hash.txt --truncation=true --do_lower_case=true --add_special_tokens=false \ inf-triton --model_name=phishing-bert-onnx --server_url=localhost:8001 --force_convert_inputs=true \ monitor --description="Inference Rate" --smoothing=0.001 --unit=inf \ - filter --threshold=0.9 --filter_source=TENSOR \ + add-scores --label=is_phishing \ serialize \ to-file --filename=/tmp/detections.jsonlines --overwrite ``` diff --git a/models/data/labels_phishing.txt b/models/data/labels_phishing.txt index 8eeea320f8..1a91ac21f3 100644 --- a/models/data/labels_phishing.txt +++ b/models/data/labels_phishing.txt @@ -1,2 +1,2 @@ -score -pred +not_phishing +is_phishing diff --git a/tests/test_cli.py b/tests/test_cli.py index 23408165ee..3cb0efea43 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -685,8 +685,8 @@ def test_pipeline_nlp(self, config, callback_values): '--truncation=True', '--do_lower_case=True', '--add_special_tokens=False' - ] + INF_TRITON_ARGS + MONITOR_ARGS + ['add-class', '--label=pred', '--threshold=0.7'] + VALIDATE_ARGS + - ['serialize'] + TO_FILE_ARGS) + ] + INF_TRITON_ARGS + MONITOR_ARGS + ['add-class', '--label=is_phishing', '--threshold=0.7'] + + VALIDATE_ARGS + ['serialize'] + TO_FILE_ARGS) obj = {} runner = CliRunner() @@ -696,7 +696,7 @@ def test_pipeline_nlp(self, config, callback_values): # Ensure our config is populated correctly config = obj["config"] assert config.mode == PipelineModes.NLP - assert config.class_labels == ["score", "pred"] + assert config.class_labels == ["not_phishing", "is_phishing"] assert config.feature_length == 128 assert config.ae is None @@ -731,7 +731,7 @@ def test_pipeline_nlp(self, config, callback_values): assert monitor._mc._unit == 'inf' assert isinstance(add_class, AddClassificationsStage) - assert add_class._labels == ('pred', ) + assert add_class._labels == ('is_phishing', ) assert add_class._threshold == 0.7 assert isinstance(validation, ValidationStage) @@ -781,8 +781,8 @@ def test_pipeline_nlp_all(self, config, callback_values, tmp_path, mlflow_uri): 'mlflow-drift', '--tracking_uri', mlflow_uri - ] + INF_TRITON_ARGS + MONITOR_ARGS + ['add-class', '--label=pred', '--threshold=0.7'] + VALIDATE_ARGS + - ['serialize'] + TO_FILE_ARGS + TO_KAFKA_ARGS) + ] + INF_TRITON_ARGS + MONITOR_ARGS + ['add-class', '--label=is_phishing', '--threshold=0.7'] + + VALIDATE_ARGS + ['serialize'] + TO_FILE_ARGS + TO_KAFKA_ARGS) obj = {} runner = CliRunner() @@ -792,7 +792,7 @@ def test_pipeline_nlp_all(self, config, callback_values, tmp_path, mlflow_uri): # Ensure our config is populated correctly config = obj["config"] assert config.mode == PipelineModes.NLP - assert config.class_labels == ["score", "pred"] + assert config.class_labels == ["not_phishing", "is_phishing"] assert config.feature_length == 128 assert config.ae is None @@ -864,7 +864,7 @@ def test_pipeline_nlp_all(self, config, callback_values, tmp_path, mlflow_uri): assert monitor._mc._unit == 'inf' assert isinstance(add_class, AddClassificationsStage) - assert add_class._labels == ('pred', ) + assert add_class._labels == ('is_phishing', ) assert add_class._threshold == 0.7 assert isinstance(validation, ValidationStage)