From 11697ded2cd0225723d505d84a31d756c16d988c Mon Sep 17 00:00:00 2001
From: Eli Fajardo <efajardo@nvidia.com>
Date: Mon, 25 Sep 2023 12:21:56 -0400
Subject: [PATCH] Phishing example fix (#1215)

- Update labels file to reflect actual model outputs: `not_phishing` and `is_phishing`
- Add `AddScores` stage to Python and CLI examples
- Remove `FilterDetectionsStage` because model output contains probabilities for both `not_phishing` and `is_phishing`one of which will always exceed threshold. This results in nothing ever being filtered out.

Closes #1208

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.

Authors:
  - Eli Fajardo (https://github.com/efajardo-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: https://github.com/nv-morpheus/Morpheus/pull/1215
---
 .../guides/2_real_world_phishing.md           | 20 +++++++++++--------
 models/data/labels_phishing.txt               |  4 ++--
 tests/test_cli.py                             | 16 +++++++--------
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/docs/source/developer_guide/guides/2_real_world_phishing.md b/docs/source/developer_guide/guides/2_real_world_phishing.md
index 0e5ed2f5ef..71c132d608 100644
--- a/docs/source/developer_guide/guides/2_real_world_phishing.md
+++ b/docs/source/developer_guide/guides/2_real_world_phishing.md
@@ -406,7 +406,8 @@ Note that the tokenizer parameters and vocabulary hash file should exactly match
 
 At this point, we have a pipeline that reads in a set of records and pre-processes them with the metadata required for our classifier to make predictions. Our next step is to define a stage that applies a machine learning model to our `MessageMeta` object. To accomplish this, we will be using Morpheus' `TritonInferenceStage`. This stage will handle communication with the `phishing-bert-onnx` model, which we provided to the Triton Docker container via the `models` directory mount.
 
-Next we will add a monitor stage to measure the inference rate as well as a filter stage to filter out any results below a probability threshold of `0.9`.
+Next we will add a monitor stage to measure the inference rate:
+
 ```python
 # Add an inference stage
 pipeline.add_stage(
@@ -418,14 +419,17 @@ pipeline.add_stage(
     ))
 
 pipeline.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
+```
 
-# Filter values lower than 0.9
-pipeline.add_stage(FilterDetectionsStage(config, threshold=0.9))
+Here we add a postprocessing stage that adds the probability score for `is_phishing`:
+
+```python
+pipeline.add_stage(AddScoresStage(config, labels=["is_phishing"]))
 ```
 
 Lastly, we will save our results to disk. For this purpose, we are using two stages that are often used in conjunction with each other: `SerializeStage` and `WriteToFileStage`.
 
-The `SerializeStage` is used to include and exclude columns as desired in the output. Importantly, it also handles conversion from the `MultiMessage`-derived output type that is used by the `FilterDetectionsStage` to the `MessageMeta` class that is expected as input by the `WriteToFileStage`.
+The `SerializeStage` is used to include and exclude columns as desired in the output. Importantly, it also handles conversion from the `MultiMessage`-derived output type to the `MessageMeta` class that is expected as input by the `WriteToFileStage`.
 
 The `WriteToFileStage` will append message data to the output file as messages are received. Note however that for performance reasons the `WriteToFileStage` does not flush its contents out to disk every time a message is received. Instead, it relies on the underlying [buffered output stream](https://gcc.gnu.org/onlinedocs/libstdc++/manual/streambufs.html) to flush as needed, and then will close the file handle on shutdown.
 
@@ -456,7 +460,7 @@ from morpheus.stages.general.monitor_stage import MonitorStage
 from morpheus.stages.inference.triton_inference_stage import TritonInferenceStage
 from morpheus.stages.input.file_source_stage import FileSourceStage
 from morpheus.stages.output.write_to_file_stage import WriteToFileStage
-from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage
+from morpheus.stages.postprocess.add_scores_stage import AddScoresStage
 from morpheus.stages.postprocess.serialize_stage import SerializeStage
 from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
 from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage
@@ -522,8 +526,8 @@ def run_pipeline():
     # Monitor the inference rate
     pipeline.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
 
-    # Filter values lower than 0.9
-    pipeline.add_stage(FilterDetectionsStage(config, threshold=0.9))
+    # Add probability score for is_phishing
+    pipeline.add_stage(AddScoresStage(config, labels=["is_phishing"]))
 
     # Write the to the output file
     pipeline.add_stage(SerializeStage(config))
@@ -550,7 +554,7 @@ morpheus --log_level=debug --plugin examples/developer_guide/2_1_real_world_phis
   preprocess --vocab_hash_file=data/bert-base-uncased-hash.txt --truncation=true --do_lower_case=true --add_special_tokens=false \
   inf-triton --model_name=phishing-bert-onnx --server_url=localhost:8001 --force_convert_inputs=true \
   monitor --description="Inference Rate" --smoothing=0.001 --unit=inf \
-  filter --threshold=0.9 --filter_source=TENSOR \
+  add-scores --label=is_phishing \
   serialize \
   to-file --filename=/tmp/detections.jsonlines --overwrite
 ```
diff --git a/models/data/labels_phishing.txt b/models/data/labels_phishing.txt
index 8eeea320f8..1a91ac21f3 100644
--- a/models/data/labels_phishing.txt
+++ b/models/data/labels_phishing.txt
@@ -1,2 +1,2 @@
-score
-pred
+not_phishing
+is_phishing
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 23408165ee..3cb0efea43 100755
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -685,8 +685,8 @@ def test_pipeline_nlp(self, config, callback_values):
                     '--truncation=True',
                     '--do_lower_case=True',
                     '--add_special_tokens=False'
-                ] + INF_TRITON_ARGS + MONITOR_ARGS + ['add-class', '--label=pred', '--threshold=0.7'] + VALIDATE_ARGS +
-                ['serialize'] + TO_FILE_ARGS)
+                ] + INF_TRITON_ARGS + MONITOR_ARGS + ['add-class', '--label=is_phishing', '--threshold=0.7'] +
+                VALIDATE_ARGS + ['serialize'] + TO_FILE_ARGS)
 
         obj = {}
         runner = CliRunner()
@@ -696,7 +696,7 @@ def test_pipeline_nlp(self, config, callback_values):
         # Ensure our config is populated correctly
         config = obj["config"]
         assert config.mode == PipelineModes.NLP
-        assert config.class_labels == ["score", "pred"]
+        assert config.class_labels == ["not_phishing", "is_phishing"]
         assert config.feature_length == 128
 
         assert config.ae is None
@@ -731,7 +731,7 @@ def test_pipeline_nlp(self, config, callback_values):
         assert monitor._mc._unit == 'inf'
 
         assert isinstance(add_class, AddClassificationsStage)
-        assert add_class._labels == ('pred', )
+        assert add_class._labels == ('is_phishing', )
         assert add_class._threshold == 0.7
 
         assert isinstance(validation, ValidationStage)
@@ -781,8 +781,8 @@ def test_pipeline_nlp_all(self, config, callback_values, tmp_path, mlflow_uri):
                     'mlflow-drift',
                     '--tracking_uri',
                     mlflow_uri
-                ] + INF_TRITON_ARGS + MONITOR_ARGS + ['add-class', '--label=pred', '--threshold=0.7'] + VALIDATE_ARGS +
-                ['serialize'] + TO_FILE_ARGS + TO_KAFKA_ARGS)
+                ] + INF_TRITON_ARGS + MONITOR_ARGS + ['add-class', '--label=is_phishing', '--threshold=0.7'] +
+                VALIDATE_ARGS + ['serialize'] + TO_FILE_ARGS + TO_KAFKA_ARGS)
 
         obj = {}
         runner = CliRunner()
@@ -792,7 +792,7 @@ def test_pipeline_nlp_all(self, config, callback_values, tmp_path, mlflow_uri):
         # Ensure our config is populated correctly
         config = obj["config"]
         assert config.mode == PipelineModes.NLP
-        assert config.class_labels == ["score", "pred"]
+        assert config.class_labels == ["not_phishing", "is_phishing"]
         assert config.feature_length == 128
 
         assert config.ae is None
@@ -864,7 +864,7 @@ def test_pipeline_nlp_all(self, config, callback_values, tmp_path, mlflow_uri):
         assert monitor._mc._unit == 'inf'
 
         assert isinstance(add_class, AddClassificationsStage)
-        assert add_class._labels == ('pred', )
+        assert add_class._labels == ('is_phishing', )
         assert add_class._threshold == 0.7
 
         assert isinstance(validation, ValidationStage)