opensearch-project · dhrubo-os · Jul 18, 2023 · Jul 14, 2023 · Jul 14, 2023 · Jul 14, 2023
@@ -13,6 +13,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 ### Fixed
 - Fix ModelUploader bug & Update model tracing demo notebook by @thanawan-atc in ([#185](https://github.com/opensearch-project/opensearch-py-ml/pull/185))
 - Fix make_model_config_json function by @thanawan-atc in ([#188](https://github.com/opensearch-project/opensearch-py-ml/pull/188))
+- Make make_model_config_json function more concise by @thanawan-atc in ([#191](https://github.com/opensearch-project/opensearch-py-ml/pull/191))
 
 ## [1.0.0]    
 
@@ -82,4 +83,4 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 
 
 [1.0.0]: https://github.com/opensearch-project/opensearch-py-ml/compare/1.0.0b1...1.0.0
-[1.0.0b1]: https://github.com/opensearch-project/opensearch-py-ml/commits/1.0.0b1
+[1.0.0b1]: https://github.com/opensearch-project/opensearch-py-ml/commits/1.0.0b1
@@ -24,6 +24,7 @@
 import yaml
 from accelerate import Accelerator, notebook_launcher
 from sentence_transformers import SentenceTransformer
+from sentence_transformers.models import Normalize, Pooling, Transformer
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import TrainingArguments, get_linear_schedule_with_warmup
@@ -244,7 +245,7 @@
         within synthetic_queries/ folder, output as a dataframe
 
         :param read_path:
-            required, path to the zipped file that contains generated queries, if None, raise exception
+            required, path to the zipped file that contains generated queries
         :type read_path: string
         :param overwrite:
             optional, synthetic_queries/ folder in current directory is to store unzip queries files.
@@ -254,12 +255,6 @@
         :return: The dataframe of queries.
         :rtype: panda dataframe
         """
-
-        if read_path is None:
-            raise Exception(
-                "No file provided. Please provide the path to synthetic query zip file."
-            )
-
         # assign a local folder 'synthetic_queries/' to store the unzip file,
         # check if the folder contains sub-folders and files, remove and clean up the folder before unzip.
         # walk through the zip file and read the file paths into file_list
@@ -989,25 +984,24 @@
         """
         parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. If all required
         fields are given by users, use the given parameters and will skip reading the config.json
-
         :param model_name:
-            Optional, The name of the model. If None, default to parse from model id, for example,
-            'msmarco-distilbert-base-tas-b'
+            Optional, The name of the model. If None, default is model id, for example,
+            'sentence-transformers/msmarco-distilbert-base-tas-b'
         :type model_name: string
         :param model_format:
             Optional, The format of the model. Default is "TORCH_SCRIPT".
         :type model_format: string
         :param version_number:
             Optional, The version number of the model. Default is 1
         :type version_number: string
-        :param embedding_dimension: Optional, the embedding dimension of the model. If None, parse embedding_dimension
-            from the config file of pre-trained hugging-face model. If not found, default to be 768
+        :param embedding_dimension: Optional, the embedding dimension of the model. If None, get embedding_dimension
+            from the pre-trained hugging-face model object.
         :type embedding_dimension: int
-        :param pooling_mode: Optional, the pooling mode of the model. If None, parse pooling_mode
-            from the config file of pre-trained hugging-face model. If not found, do not include it.
+        :param pooling_mode: Optional, the pooling mode of the model. If None, get pooling_mode
+            from the pre-trained hugging-face model object.
         :type pooling_mode: string
-        :param normalize_result: Optional, whether to normalize the result of the model. If None, check if 2_Normalize folder
-            exists in the pre-trained hugging-face model folder. If not found, do not include it.
+        :param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained
+        hugging-face model object.
         :type normalize_result: bool
         :param all_config:
             Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained
@@ -1028,8 +1022,43 @@
         if model_name is None:
             model_name = self.model_id
 
-        # if user input model_type and embedding_dimension, it will skip reading the config.json file
-        if model_type is None or embedding_dimension is None:
+        # if user input model_type/embedding_dimension/pooling_mode, it will skip this step.
+        model = SentenceTransformer(self.model_id)
+        if (
+            model_type is None
+            or embedding_dimension is None
+            or pooling_mode is None
+            or normalize_result is None
+        ):
+            try:
+                if (
+                    model_type is None
+                    and len(model._modules) >= 1
+                    and isinstance(model._modules["0"], Transformer)
+                ):
+                    model_type = model._modules["0"].auto_model.__class__.__name__
+                    model_type = model_type.lower().rstrip("model")
+                if embedding_dimension is None:
+                    embedding_dimension = model.get_sentence_embedding_dimension()
+                if (
+                    pooling_mode is None
+                    and len(model._modules) >= 2
+                    and isinstance(model._modules["1"], Pooling)
+                ):
+                    pooling_mode = model._modules["1"].get_pooling_mode_str().upper()
+                if normalize_result is None:
+                    if len(model._modules) >= 3 and isinstance(
+                        model._modules["2"], Normalize
+                    ):
+                        normalize_result = True
+                    else:
+                        normalize_result = False
+            except Exception as e:
+                raise Exception(
+                    f"Raised exception while getting model data from pre-trained hugging-face model object: {e}"
+                )
+
+        if all_config is None:
             if not os.path.exists(config_json_file_path):
                 raise Exception(
                     str(
@@ -1045,37 +1074,6 @@
                     config_content = json.load(f)
                     if all_config is None:
                         all_config = config_content
-                    if model_type is None:
-                        if "model_type" in config_content.keys():
-                            model_type = config_content["model_type"]
-                        else:
-                            print(
-                                "Please check file or input model_type and embedding_dimension in the argument"
-                            )
-                            raise Exception(
-                                str(
-                                    "Cannot find model_type in config.json file"
-                                    + config_json_file_path
-                                    + ". Please check the config.son file in the path."
-                                )
-                            )
-                    if embedding_dimension is None:
-                        embedding_dimension_mapping_list = [
-                            "dim",
-                            "hidden_size",
-                            "d_model",
-                        ]
-                        for mapping_item in embedding_dimension_mapping_list:
-                            if mapping_item in config_content.keys():
-                                embedding_dimension = config_content[mapping_item]
-                                break
-                        else:
-                            print(
-                                'Cannot find "dim" or "hidden_size" or "d_model" in config.json file at ',
-                                config_json_file_path,
-                                ". Please add in the config file or input in the argument for embedding_dimension.",
-                            )
-                            embedding_dimension = 768
             except IOError:
                 print(
                     "Cannot open in config.json file at ",
@@ -1093,63 +1091,12 @@
                 "model_type": model_type,
                 "embedding_dimension": embedding_dimension,
                 "framework_type": "sentence_transformers",
+                "pooling_mode": pooling_mode,
+                "normalize_result": normalize_result,
                 "all_config": json.dumps(all_config),
             },
         }
 
-        if pooling_mode is not None:
-            model_config_content["model_config"]["pooling_mode"] = pooling_mode
-        else:
-            pooling_config_json_file_path = os.path.join(
-                folder_path, "1_Pooling", "config.json"
-            )
-            if os.path.exists(pooling_config_json_file_path):
-                try:
-                    with open(pooling_config_json_file_path) as f:
-                        if verbose:
-                            print(
-                                "reading pooling config file from: "
-                                + pooling_config_json_file_path
-                            )
-                        pooling_config_content = json.load(f)
-                        pooling_mode_mapping_dict = {
-                            "pooling_mode_cls_token": "CLS",
-                            "pooling_mode_mean_tokens": "MEAN",
-                            "pooling_mode_max_tokens": "MAX",
-                            "pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN",
-                        }
-                        for mapping_item in pooling_mode_mapping_dict:
-                            if (
-                                mapping_item in pooling_config_content.keys()
-                                and pooling_config_content[mapping_item]
-                            ):
-                                pooling_mode = pooling_mode_mapping_dict[mapping_item]
-                                model_config_content["model_config"][
-                                    "pooling_mode"
-                                ] = pooling_mode
-                                break
-                        else:
-                            print(
-                                'Cannot find "pooling_mode_[mode]_token(s)" with value true in config.json file at ',
-                                pooling_config_json_file_path,
-                                ". Please add in the pooling config file or input in the argument for pooling_mode.",
-                            )
-
-                except IOError:
-                    print(
-                        "Cannot open in config.json file at ",
-                        pooling_config_json_file_path,
-                        ". Please check the config.json ",
-                        "file in the path.",
-                    )
-
-        if normalize_result is not None:
-            model_config_content["model_config"]["normalize_result"] = normalize_result
-        else:
-            normalize_result_json_file_path = os.path.join(folder_path, "2_Normalize")
-            if os.path.exists(normalize_result_json_file_path):
-                model_config_content["model_config"]["normalize_result"] = True
-
         if verbose:
             print("generating ml-commons_model_config.json file...\n")
             print(model_config_content)

@@ -303,6 +303,7 @@ def test_integration_model_train_register_full_cycle():
         zip_file_name=MODEL_FILE_ZIP_NAME,
         num_epochs=1,
         overwrite=True,
+        verbose=True,
     )
     # second generating the config file to create metadoc of the model in opensearch.
     test_model.make_model_config_json()

@@ -107,7 +107,7 @@ def test_missing_files():
     with pytest.raises(FileNotFoundError):
         test_model.read_queries(read_path="1234")
 
-        # test synthetic queries already exists in folder
+    # test synthetic queries already exists in folder
     with pytest.raises(Exception) as exc_info:
         temp_path = os.path.join(
             os.path.dirname(os.path.abspath("__file__")),
@@ -123,7 +123,7 @@ def test_missing_files():
 
     # test no tokenizer.json file
     with pytest.raises(Exception) as exc_info:
-        test_model.zip_model()
+        test_model.zip_model(verbose=True)
     assert "Cannot find tokenizer.json file" in str(exc_info.value)
 
     # test no model file
@@ -137,7 +137,7 @@ def test_missing_files():
         test_model3 = SentenceTransformerModel(folder_path=temp_path)
         test_model3.save_as_pt(sentences=["today is sunny"])
         os.remove(os.path.join(temp_path, "msmarco-distilbert-base-tas-b.pt"))
-        test_model3.zip_model()
+        test_model3.zip_model(verbose=True)
         clean_test_folder(temp_path)
     assert "Cannot find model in the model path" in str(exc_info.value)
 
@@ -187,7 +187,7 @@ def test_make_model_config_json_for_torch_script():
 
     test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"])
     model_config_path_torch = test_model5.make_model_config_json(
-        model_format="TORCH_SCRIPT"
+        model_format="TORCH_SCRIPT", verbose=True
     )
     try:
         with open(model_config_path_torch) as json_file:
@@ -372,85 +372,5 @@ def test_overwrite_fields_in_model_config():
     clean_test_folder(TEST_FOLDER)
 
 
-def test_missing_fields_in_config_json():
-    model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
-    expected_model_config_data = {
-        "embedding_dimension": 768,
-        "normalize_result": False,
-    }
-
-    clean_test_folder(TEST_FOLDER)
-    test_model9 = SentenceTransformerModel(
-        folder_path=TEST_FOLDER,
-        model_id=model_id,
-    )
-
-    test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"])
-
-    pooling_json_file_path = os.path.join(TEST_FOLDER, "1_Pooling", "config.json")
-    try:
-        with open(pooling_json_file_path, "w") as f:
-            empty_dict = {}
-            json.dump(empty_dict, f)
-    except Exception as exec:
-        assert False, f"Modifying pooling json file raised an exception {exec}"
-
-    config_json_file_path = os.path.join(TEST_FOLDER, "config.json")
-    try:
-        with open(config_json_file_path, "r") as f:
-            config_content = json.load(f)
-            embedding_dimension_mapping_list = [
-                "dim",
-                "hidden_size",
-                "d_model",
-            ]
-            for mapping_item in embedding_dimension_mapping_list:
-                config_content.pop(mapping_item, None)
-
-        with open(config_json_file_path, "w") as f:
-            json.dump(config_content, f)
-    except Exception as exec:
-        assert False, f"Modifying config json file raised an exception {exec}"
-
-    model_config_path_torch = test_model9.make_model_config_json(
-        model_format="TORCH_SCRIPT", verbose=True
-    )
-    try:
-        with open(model_config_path_torch) as json_file:
-            model_config_data_torch = json.load(json_file)
-    except Exception as exec:
-        assert (
-            False
-        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
-
-    assert (
-        "name" in model_config_data_torch
-        and model_config_data_torch["name"] == model_id
-    ), "Missing or Wrong model name in torch script model config file"
-    assert (
-        "model_format" in model_config_data_torch
-        and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
-    )
-    assert (
-        "model_config" in model_config_data_torch
-    ), "Missing 'model_config' in torch script model config file"
-
-    for k, v in expected_model_config_data.items():
-        assert (
-            k in model_config_data_torch["model_config"]
-            and model_config_data_torch["model_config"][k] == v
-        ) or (
-            k not in model_config_data_torch["model_config"]
-            and k == "normalize_result"
-            and not v
-        ), "make_model_config_json() does not generate an expected model config"
-
-    assert (
-        "pooling_mode" not in model_config_data_torch
-    ), "make_model_config_json() does not generate an expected model config"
-
-    clean_test_folder(TEST_FOLDER)
-
-
 clean_test_folder(TEST_FOLDER)
 clean_test_folder(TESTDATA_UNZIP_FOLDER)