Merge pull request #176 from thanawan-atc/add_description_config_json

Add description config json
thanawan-atc · Aug 9, 2023 · 3b20dac · 3b20dac
2 parents 2012aab + f5f7552
commit 3b20dac
Show file tree

Hide file tree

Showing 3 changed files with 103 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,15 @@
 # CHANGELOG
 Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 
+## [1.2.0]
+
+### Added
+
+### Changed
+
+### Fixed
+- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
+
 ## [1.1.0]
 
 ### Added
@@ -33,7 +42,6 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Make make_model_config_json function more concise by @thanawan-atc in ([#191](https://github.com/opensearch-project/opensearch-py-ml/pull/191))
 - Enabled auto-truncation for any pretrained models by @Yerzhaisang in ([#192](https://github.com/opensearch-project/opensearch-py-ml/pull/192))
 - Generalize make_model_config_json function by @thanawan-atc in ([#200](https://github.com/opensearch-project/opensearch-py-ml/pull/200))
-- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
 
 ## [1.0.0]    
 

diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -1008,34 +1008,72 @@ def set_up_accelerate_config(
                 "Failed to open config file for ml common upload: " + file_path + "\n"
             )
 
-    def get_model_description_from_md_file(self, readme_file_path) -> str:
+    def _get_model_description_from_readme_file(self, readme_file_path) -> str:
         """
-        Get description of the model from README.md file
+        Get description of the model from README.md file in the model folder
         after the model is saved in local directory
 
+        See example here:
+        https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b/blob/main/README.md
+
+        This function assumes that the README.md has the following format:
+
+        # sentence-transformers/msmarco-distilbert-base-tas-b
+        This is [ ... further description ... ]
+
+        # [ ... Next section ...]
+        ...
+
         :param readme_file_path: Path to README.md file
         :type readme_file_path: string
         :return: Description of the model
         :rtype: string
         """
         readme_data = MarkDownFile.read_file(readme_file_path)
+
+        # Find the description section
         start_str = f"# {self.model_id}"
         start = readme_data.find(start_str)
         if start == -1:
             model_name = self.model_id.split("/")[1]
             start_str = f"# {model_name}"
             start = readme_data.find(start_str)
-        end = readme_data.find("## ", start)
+        end = readme_data.find("\n#", start + len(start_str))
+
+        # If we cannot find the scope of description section, raise error.
         if start == -1 or end == -1:
             assert False, "Cannot find description in README.md file"
 
+        # Parse out the description section
         description = readme_data[start + len(start_str) + 1 : end].strip()
+        description = description.split("\n")[0]
+
+        # Remove hyperlink and reformat text
         description = re.sub(r"\(.*?\)", "", description)
         description = re.sub(r"[\[\]]", "", description)
         description = re.sub(r"\*", "", description)
+
+        # Remove unnecessary part if exists (i.e. " For an introduction to ...")
+        # (Found in https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/README.md)
         unnecessary_part = description.find(" For an introduction to")
         if unnecessary_part != -1:
             description = description[:unnecessary_part]
+
+        return description
+
+    def _generate_default_model_description(self, embedding_dimension) -> str:
+        """
+        Generate a default description of the model based on embedding_dimension
+
+        :param embedding_dimension: Embedding dimension of the model.
+        :type embedding_dimension: int
+        :return: Description of the model
+        :rtype: string
+        """
+        print(
+            "Using default description from embedding_dimension instead (You can overwrite this by specifying description parameter in make_model_config_json function"
+        )
+        description = f"This is a sentence-transformers model: It maps sentences & paragraphs to a {embedding_dimension} dimensional dense vector space."
         return description
 
     def make_model_config_json(
@@ -1129,11 +1167,19 @@ def make_model_config_json(
                 try:
                     if verbose:
                         print("reading README.md file")
-                    description = self.get_model_description_from_md_file(
+                    description = self._get_model_description_from_readme_file(
                         readme_file_path
                     )
                 except Exception as e:
-                    print(f"Cannot get model description from README.md file: {e}")
+                    print(f"Cannot scrape model description from README.md file: {e}")
+                    description = self._generate_default_model_description(
+                        embedding_dimension
+                    )
+            else:
+                print("Cannot find README.md file to scrape model description")
+                description = self._generate_default_model_description(
+                    embedding_dimension
+                )
 
         if all_config is None:
             if not os.path.exists(config_json_file_path):
@@ -1162,6 +1208,7 @@ def make_model_config_json(
         model_config_content = {
             "name": model_name,
             "version": version_number,
+            "description": description,
             "model_format": model_format,
             "model_task_type": "TEXT_EMBEDDING",
             "model_config": {
@@ -1174,9 +1221,6 @@ def make_model_config_json(
             },
         }
 
-        if description is not None:
-            model_config_content["description"] = description
-
         if verbose:
             print("generating ml-commons_model_config.json file...\n")
             print(model_config_content)

diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py
@@ -262,7 +262,7 @@ def test_make_model_config_json_for_onnx():
         "description" in model_config_data_onnx
         and model_config_data_onnx["description"]
         == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
-    ), "Missing or Wrong model description in onnx model config file'"
+    ), "Missing or Wrong model description in onnx model config file"
     assert (
         "model_config" in model_config_data_onnx
     ), "Missing 'model_config' in onnx model config file"
@@ -408,14 +408,16 @@ def test_missing_readme_md_file():
         ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
 
     assert (
-        "description" not in model_config_data_torch
-    ), "Should not have description in model config file"
+        "description" in model_config_data_torch
+        and model_config_data_torch["description"]
+        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space."
+    ), "Should use default model description when README.md file is missing"
 
     clean_test_folder(TEST_FOLDER)
 
 
-def test_missing_description_in_readme_file():
-    model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+def test_missing_expected_description_in_readme_file():
+    model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
     clean_test_folder(TEST_FOLDER)
     test_model10 = SentenceTransformerModel(
         folder_path=TEST_FOLDER,
@@ -441,8 +443,10 @@ def test_missing_description_in_readme_file():
         ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
 
     assert (
-        "description" not in model_config_data_torch
-    ), "Should not have description in model config file"
+        "description" in model_config_data_torch
+        and model_config_data_torch["description"]
+        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space."
+    ), "Should use default model description when description is missing from README.md"
 
     clean_test_folder(TEST_FOLDER)
 
@@ -475,17 +479,46 @@ def test_overwrite_description():
     clean_test_folder(TEST_FOLDER)
 
 
+def test_long_description():
+    model_id = "sentence-transformers/gtr-t5-base"
+    clean_test_folder(TEST_FOLDER)
+    test_model12 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    model_config_path_torch = test_model12.make_model_config_json(
+        model_format="TORCH_SCRIPT"
+    )
+    try:
+        with open(model_config_path_torch) as json_file:
+            model_config_data_torch = json.load(json_file)
+    except Exception as exec:
+        assert (
+            False
+        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+
+    assert (
+        "description" in model_config_data_torch
+        and model_config_data_torch["description"]
+        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of sematic search."
+    ), "Missing or Wrong model description in torch_script model config file"
+
+    clean_test_folder(TEST_FOLDER)
+
+
 def test_truncation_parameter():
     model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
     MAX_LENGTH_TASB = 512
 
     clean_test_folder(TEST_FOLDER)
-    test_model12 = SentenceTransformerModel(
+    test_model13 = SentenceTransformerModel(
         folder_path=TEST_FOLDER,
         model_id=model_id,
     )
 
-    test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"])
 
     tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
     try: