From b5dd291cb295c0ad8f87e5e0e7fe8bb32ab53bb7 Mon Sep 17 00:00:00 2001
From: Thanawan Atchariyachanvanit <latchari@amazon.com>
Date: Fri, 7 Jul 2023 11:48:24 -0700
Subject: [PATCH] Fix make_model_config_json function (#188)

* Improve make_model_config_json function

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Fix linting issues

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Add CHANGELOG.md

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Add unittest

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Correct linting

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Correct linting

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Fix bug

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Minor Edit + Add more tests

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Fix test

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Correct typos

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Increase test coverage

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Increase test coverage (2)

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Increase test coverage (3)

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Remove redundant line

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

---------

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>
(cherry picked from commit 13149e8c089a0ffdf2b587b09e59b2990db24779)
---
 CHANGELOG.md                                  |   1 +
 .../ml_models/sentencetransformermodel.py     | 100 +++++--
 .../test_sentencetransformermodel_pytest.py   | 282 ++++++++++++++++++
 3 files changed, 366 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 003129b7..7baf3238 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 
 ### Fixed
 - Fix ModelUploader bug & Update model tracing demo notebook by @thanawan-atc in ([#185](https://github.com/opensearch-project/opensearch-py-ml/pull/185))
+- Fix make_model_config_json function by @thanawan-atc in ([#188](https://github.com/opensearch-project/opensearch-py-ml/pull/188))
 
 ## [1.0.0]    
 
diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index 42bac514..e22b574f 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -978,11 +978,14 @@ def make_model_config_json(
         self,
         model_name: str = None,
         version_number: str = 1,
+        model_format: str = "TORCH_SCRIPT",
         embedding_dimension: int = None,
+        pooling_mode: str = None,
+        normalize_result: bool = None,
         all_config: str = None,
         model_type: str = None,
         verbose: bool = False,
-    ) -> None:
+    ) -> str:
         """
         parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. If all required
         fields are given by users, use the given parameters and will skip reading the config.json
@@ -991,12 +994,21 @@ def make_model_config_json(
             Optional, The name of the model. If None, default to parse from model id, for example,
             'msmarco-distilbert-base-tas-b'
         :type model_name: string
+        :param model_format:
+            Optional, The format of the model. Default is "TORCH_SCRIPT".
+        :type model_format: string
         :param version_number:
-            Optional, The version number of the model. default is 1
+            Optional, The version number of the model. Default is 1
         :type version_number: string
-        :param embedding_dimension: Optional, the embedding_dimension of the model. If None, parse embedding_dimension
-            from the config file of pre-trained hugging-face model, if not found, default to be 768
+        :param embedding_dimension: Optional, the embedding dimension of the model. If None, parse embedding_dimension
+            from the config file of pre-trained hugging-face model. If not found, default to be 768
         :type embedding_dimension: int
+        :param pooling_mode: Optional, the pooling mode of the model. If None, parse pooling_mode
+            from the config file of pre-trained hugging-face model. If not found, do not include it.
+        :type pooling_mode: string
+        :param normalize_result: Optional, whether to normalize the result of the model. If None, check if 2_Normalize folder
+            exists in the pre-trained hugging-face model folder. If not found, do not include it.
+        :type normalize_result: bool
         :param all_config:
             Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained
             hugging-face model
@@ -1008,8 +1020,8 @@ def make_model_config_json(
         :param verbose:
             optional, use printing more logs. Default as false
         :type verbose: bool
-        :return: no return value expected
-        :rtype: None
+        :return: model config file path. The file path where the model config file is being saved
+        :rtype: string
         """
         folder_path = self.folder_path
         config_json_file_path = os.path.join(folder_path, "config.json")
@@ -1057,27 +1069,25 @@ def make_model_config_json(
                             if mapping_item in config_content.keys():
                                 embedding_dimension = config_content[mapping_item]
                                 break
-                            else:
-                                print(
-                                    'Cannot find "dim" or "hidden_size" or "d_model" in config.json file at ',
-                                    config_json_file_path,
-                                )
-                                print(
-                                    "Please add in the config file or input in the argument for embedding_dimension "
-                                )
-                                embedding_dimension = 768
+                        else:
+                            print(
+                                'Cannot find "dim" or "hidden_size" or "d_model" in config.json file at ',
+                                config_json_file_path,
+                                ". Please add in the config file or input in the argument for embedding_dimension.",
+                            )
+                            embedding_dimension = 768
             except IOError:
                 print(
                     "Cannot open in config.json file at ",
                     config_json_file_path,
-                    ". Please check the config.son ",
+                    ". Please check the config.json ",
                     "file in the path.",
                 )
 
         model_config_content = {
             "name": model_name,
             "version": version_number,
-            "model_format": "TORCH_SCRIPT",
+            "model_format": model_format,
             "model_task_type": "TEXT_EMBEDDING",
             "model_config": {
                 "model_type": model_type,
@@ -1086,6 +1096,60 @@ def make_model_config_json(
                 "all_config": json.dumps(all_config),
             },
         }
+
+        if pooling_mode is not None:
+            model_config_content["model_config"]["pooling_mode"] = pooling_mode
+        else:
+            pooling_config_json_file_path = os.path.join(
+                folder_path, "1_Pooling", "config.json"
+            )
+            if os.path.exists(pooling_config_json_file_path):
+                try:
+                    with open(pooling_config_json_file_path) as f:
+                        if verbose:
+                            print(
+                                "reading pooling config file from: "
+                                + pooling_config_json_file_path
+                            )
+                        pooling_config_content = json.load(f)
+                        pooling_mode_mapping_dict = {
+                            "pooling_mode_cls_token": "CLS",
+                            "pooling_mode_mean_tokens": "MEAN",
+                            "pooling_mode_max_tokens": "MAX",
+                            "pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN",
+                        }
+                        for mapping_item in pooling_mode_mapping_dict:
+                            if (
+                                mapping_item in pooling_config_content.keys()
+                                and pooling_config_content[mapping_item]
+                            ):
+                                pooling_mode = pooling_mode_mapping_dict[mapping_item]
+                                model_config_content["model_config"][
+                                    "pooling_mode"
+                                ] = pooling_mode
+                                break
+                        else:
+                            print(
+                                'Cannot find "pooling_mode_[mode]_token(s)" with value true in config.json file at ',
+                                pooling_config_json_file_path,
+                                ". Please add in the pooling config file or input in the argument for pooling_mode.",
+                            )
+
+                except IOError:
+                    print(
+                        "Cannot open in config.json file at ",
+                        pooling_config_json_file_path,
+                        ". Please check the config.json ",
+                        "file in the path.",
+                    )
+
+        if normalize_result is not None:
+            model_config_content["model_config"]["normalize_result"] = normalize_result
+        else:
+            normalize_result_json_file_path = os.path.join(folder_path, "2_Normalize")
+            if os.path.exists(normalize_result_json_file_path):
+                model_config_content["model_config"]["normalize_result"] = True
+
         if verbose:
             print("generating ml-commons_model_config.json file...\n")
             print(model_config_content)
@@ -1100,6 +1164,8 @@ def make_model_config_json(
             "ml-commons_model_config.json file is saved at : ", model_config_file_path
         )
 
+        return model_config_file_path
+
     # private methods
     def __qryrem(self, x):
         # for removing the "QRY:" token if they exist in passages
diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py
index bf2bd5e0..37beaa3e 100644
--- a/tests/ml_models/test_sentencetransformermodel_pytest.py
+++ b/tests/ml_models/test_sentencetransformermodel_pytest.py
@@ -5,6 +5,7 @@
 # Any modifications Copyright OpenSearch Contributors. See
 # GitHub history for details.
 
+import json
 import os
 import shutil
 
@@ -170,5 +171,286 @@ def test_save_as_onnx():
         assert False, f"Tracing model in ONNX raised an exception {exec}"
 
 
+def test_make_model_config_json_for_torch_script():
+    model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+    expected_model_config_data = {
+        "embedding_dimension": 384,
+        "pooling_mode": "MEAN",
+        "normalize_result": True,
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model5 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    model_config_path_torch = test_model5.make_model_config_json(
+        model_format="TORCH_SCRIPT"
+    )
+    try:
+        with open(model_config_path_torch) as json_file:
+            model_config_data_torch = json.load(json_file)
+    except Exception as exec:
+        assert (
+            False
+        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+
+    assert (
+        "name" in model_config_data_torch
+        and model_config_data_torch["name"] == model_id
+    ), "Missing or Wrong model name in torch script model config file"
+    assert (
+        "model_format" in model_config_data_torch
+        and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
+    )
+    assert (
+        "model_config" in model_config_data_torch
+    ), "Missing 'model_config' in torch script model config file"
+
+    for k, v in expected_model_config_data.items():
+        assert (
+            k in model_config_data_torch["model_config"]
+            and model_config_data_torch["model_config"][k] == v
+        ) or (
+            k not in model_config_data_torch["model_config"]
+            and k == "normalize_result"
+            and not v
+        )
+
+    clean_test_folder(TEST_FOLDER)
+
+
+def test_make_model_config_json_for_onnx():
+    model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    expected_model_config_data = {
+        "embedding_dimension": 384,
+        "pooling_mode": "MEAN",
+        "normalize_result": False,
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model6 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model6.save_as_onnx(model_id=model_id)
+    model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX")
+    try:
+        with open(model_config_path_onnx) as json_file:
+            model_config_data_onnx = json.load(json_file)
+    except Exception as exec:
+        assert (
+            False
+        ), f"Creating model config file for tracing in onnx raised an exception {exec}"
+
+    assert (
+        "name" in model_config_data_onnx and model_config_data_onnx["name"] == model_id
+    ), "Missing or Wrong model name in onnx model config file'"
+    assert (
+        "model_format" in model_config_data_onnx
+        and model_config_data_onnx["model_format"] == "ONNX"
+    )
+    assert (
+        "model_config" in model_config_data_onnx
+    ), "Missing 'model_config' in onnx model config file"
+
+    for k, v in expected_model_config_data.items():
+        assert (
+            k in model_config_data_onnx["model_config"]
+            and model_config_data_onnx["model_config"][k] == v
+        ) or (
+            k not in model_config_data_onnx["model_config"]
+            and k == "normalize_result"
+            and not v
+        )
+
+    clean_test_folder(TEST_FOLDER)
+
+
+def test_overwrite_fields_in_model_config():
+    model_id = "sentence-transformers/all-distilroberta-v1"
+    expected_model_config_data = {
+        "embedding_dimension": 768,
+        "pooling_mode": "MEAN",
+        "normalize_result": True,
+    }
+
+    overwritten_model_config_data = {
+        "embedding_dimension": 128,
+        "pooling_mode": "MAX",
+        "normalize_result": False,
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model7 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model7.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    model_config_path_torch = test_model7.make_model_config_json(
+        model_format="TORCH_SCRIPT"
+    )
+
+    try:
+        with open(model_config_path_torch) as json_file:
+            model_config_data_torch = json.load(json_file)
+    except Exception as exec:
+        assert (
+            False
+        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+
+    assert (
+        "name" in model_config_data_torch
+        and model_config_data_torch["name"] == model_id
+    ), "Missing or Wrong model name in torch script model config file"
+    assert (
+        "model_format" in model_config_data_torch
+        and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
+    )
+    assert (
+        "model_config" in model_config_data_torch
+    ), "Missing 'model_config' in torch script model config file"
+
+    for k, v in expected_model_config_data.items():
+        assert (
+            k in model_config_data_torch["model_config"]
+            and model_config_data_torch["model_config"][k] == v
+        ) or (
+            k not in model_config_data_torch["model_config"]
+            and k == "normalize_result"
+            and not v
+        )
+
+    clean_test_folder(TEST_FOLDER)
+    test_model8 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model8.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    model_config_path_torch = test_model8.make_model_config_json(
+        model_format="TORCH_SCRIPT",
+        embedding_dimension=overwritten_model_config_data["embedding_dimension"],
+        pooling_mode=overwritten_model_config_data["pooling_mode"],
+        normalize_result=overwritten_model_config_data["normalize_result"],
+    )
+
+    try:
+        with open(model_config_path_torch) as json_file:
+            model_config_data_torch = json.load(json_file)
+    except Exception as exec:
+        assert (
+            False
+        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+
+    assert (
+        "name" in model_config_data_torch
+        and model_config_data_torch["name"] == model_id
+    ), "Missing or Wrong model name in torch script model config file"
+    assert (
+        "model_format" in model_config_data_torch
+        and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
+    )
+    assert (
+        "model_config" in model_config_data_torch
+    ), "Missing 'model_config' in torch script model config file"
+
+    for k, v in overwritten_model_config_data.items():
+        assert (
+            k in model_config_data_torch["model_config"]
+            and model_config_data_torch["model_config"][k] == v
+        ) or (
+            k not in model_config_data_torch["model_config"]
+            and k == "normalize_result"
+            and not v
+        )
+
+    clean_test_folder(TEST_FOLDER)
+
+
+def test_missing_fields_in_config_json():
+    model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+    expected_model_config_data = {
+        "embedding_dimension": 768,
+        "normalize_result": False,
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model9 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+
+    pooling_json_file_path = os.path.join(TEST_FOLDER, "1_Pooling", "config.json")
+    try:
+        with open(pooling_json_file_path, "w") as f:
+            empty_dict = {}
+            json.dump(empty_dict, f)
+    except Exception as exec:
+        assert False, f"Modifying pooling json file raised an exception {exec}"
+
+    config_json_file_path = os.path.join(TEST_FOLDER, "config.json")
+    try:
+        with open(config_json_file_path, "r") as f:
+            config_content = json.load(f)
+            embedding_dimension_mapping_list = [
+                "dim",
+                "hidden_size",
+                "d_model",
+            ]
+            for mapping_item in embedding_dimension_mapping_list:
+                config_content.pop(mapping_item, None)
+
+        with open(config_json_file_path, "w") as f:
+            json.dump(config_content, f)
+    except Exception as exec:
+        assert False, f"Modifying config json file raised an exception {exec}"
+
+    model_config_path_torch = test_model9.make_model_config_json(
+        model_format="TORCH_SCRIPT", verbose=True
+    )
+    try:
+        with open(model_config_path_torch) as json_file:
+            model_config_data_torch = json.load(json_file)
+    except Exception as exec:
+        assert (
+            False
+        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+
+    assert (
+        "name" in model_config_data_torch
+        and model_config_data_torch["name"] == model_id
+    ), "Missing or Wrong model name in torch script model config file"
+    assert (
+        "model_format" in model_config_data_torch
+        and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
+    )
+    assert (
+        "model_config" in model_config_data_torch
+    ), "Missing 'model_config' in torch script model config file"
+
+    for k, v in expected_model_config_data.items():
+        assert (
+            k in model_config_data_torch["model_config"]
+            and model_config_data_torch["model_config"][k] == v
+        ) or (
+            k not in model_config_data_torch["model_config"]
+            and k == "normalize_result"
+            and not v
+        ), "make_model_config_json() does not generate an expected model config"
+
+    assert (
+        "pooling_mode" not in model_config_data_torch
+    ), "make_model_config_json() does not generate an expected model config"
+
+    clean_test_folder(TEST_FOLDER)
+
+
 clean_test_folder(TEST_FOLDER)
 clean_test_folder(TESTDATA_UNZIP_FOLDER)