diff --git a/CHANGELOG.md b/CHANGELOG.md
index 22d88f3b..7bc2109a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,15 @@
 # CHANGELOG
 Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 
+## [1.2.0]
+
+### Added
+
+### Changed
+
+### Fixed
+- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
+
 ## [1.1.0]
 
 ### Added
@@ -33,7 +42,6 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Make make_model_config_json function more concise by @thanawan-atc in ([#191](https://github.com/opensearch-project/opensearch-py-ml/pull/191))
 - Enabled auto-truncation for any pretrained models by @Yerzhaisang in ([#192](https://github.com/opensearch-project/opensearch-py-ml/pull/192))
 - Generalize make_model_config_json function by @thanawan-atc in ([#200](https://github.com/opensearch-project/opensearch-py-ml/pull/200))
-- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
 
 ## [1.0.0]
 
diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index de650a24..05db5270 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -1008,34 +1008,72 @@ def set_up_accelerate_config(
                 "Failed to open config file for ml common upload: " + file_path + "\n"
             )
 
-    def get_model_description_from_md_file(self, readme_file_path) -> str:
+    def _get_model_description_from_readme_file(self, readme_file_path) -> str:
         """
-        Get description of the model from README.md file
+        Get description of the model from the README.md file in the model folder after the model is saved in the local directory
+        See example here:
+        https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b/blob/main/README.md
+
+        This function assumes that the README.md has the following format:
+
+        # sentence-transformers/msmarco-distilbert-base-tas-b
+        This is [ ... further description ... ]
+
+        # [ ... Next section ...]
+        ...
+
         :param readme_file_path: Path to README.md file
         :type readme_file_path: string
         :return: Description of the model
        :rtype: string
         """
         readme_data = MarkDownFile.read_file(readme_file_path)
+
+        # Find the description section
         start_str = f"# {self.model_id}"
         start = readme_data.find(start_str)
         if start == -1:
             model_name = self.model_id.split("/")[1]
             start_str = f"# {model_name}"
             start = readme_data.find(start_str)
-        end = readme_data.find("## ", start)
+        end = readme_data.find("\n#", start + len(start_str))
+
+        # If we cannot find the scope of the description section, raise an error.
         if start == -1 or end == -1:
             assert False, "Cannot find description in README.md file"
 
+        # Parse out the description section
         description = readme_data[start + len(start_str) + 1 : end].strip()
+        description = description.split("\n")[0]
+
+        # Remove hyperlinks and reformat text
         description = re.sub(r"\(.*?\)", "", description)
         description = re.sub(r"[\[\]]", "", description)
         description = re.sub(r"\*", "", description)
+
+        # Remove unnecessary part if it exists (i.e. " For an introduction to ...")
+        # (Found in https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/README.md)
         unnecessary_part = description.find(" For an introduction to")
         if unnecessary_part != -1:
             description = description[:unnecessary_part]
+
+        return description
+
+    def _generate_default_model_description(self, embedding_dimension) -> str:
+        """
+        Generate a default model description based on embedding_dimension
+
+        :param embedding_dimension: Embedding dimension of the model.
+        :type embedding_dimension: int
+        :return: Description of the model
+        :rtype: string
+        """
+        print(
+            "Using default description from embedding_dimension instead (You can overwrite this by specifying the description parameter in the make_model_config_json function)"
+        )
+        description = f"This is a sentence-transformers model: It maps sentences & paragraphs to a {embedding_dimension} dimensional dense vector space."
         return description
 
     def make_model_config_json(
@@ -1129,11 +1167,19 @@
             try:
                 if verbose:
                     print("reading README.md file")
-                description = self.get_model_description_from_md_file(
+                description = self._get_model_description_from_readme_file(
                     readme_file_path
                 )
             except Exception as e:
-                print(f"Cannot get model description from README.md file: {e}")
+                print(f"Cannot scrape model description from README.md file: {e}")
+                description = self._generate_default_model_description(
+                    embedding_dimension
+                )
+        else:
+            print("Cannot find README.md file to scrape model description")
+            description = self._generate_default_model_description(
+                embedding_dimension
+            )
 
         if all_config is None:
             if not os.path.exists(config_json_file_path):
@@ -1162,6 +1208,7 @@
         model_config_content = {
             "name": model_name,
             "version": version_number,
+            "description": description,
             "model_format": model_format,
             "model_task_type": "TEXT_EMBEDDING",
             "model_config": {
@@ -1174,9 +1221,6 @@
             },
         }
 
-        if description is not None:
-            model_config_content["description"] = description
-
         if verbose:
             print("generating ml-commons_model_config.json file...\n")
             print(model_config_content)
diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py
index 63c7c515..7bf0c95b 100644
--- a/tests/ml_models/test_sentencetransformermodel_pytest.py
+++ b/tests/ml_models/test_sentencetransformermodel_pytest.py
@@ -262,7 +262,7 @@ def test_make_model_config_json_for_onnx():
         "description" in model_config_data_onnx
         and model_config_data_onnx["description"]
         == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
-    ), "Missing or Wrong model description in onnx model config file'"
+    ), "Missing or Wrong model description in onnx model config file"
     assert (
         "model_config" in model_config_data_onnx
     ), "Missing 'model_config' in onnx model config file"
@@ -408,14 +408,16 @@ def test_missing_readme_md_file():
     ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
 
     assert (
-        "description" not in model_config_data_torch
-    ), "Should not have description in model config file"
+        "description" in model_config_data_torch
+        and model_config_data_torch["description"]
+        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space."
+    ), "Should use default model description when README.md file is missing"
 
     clean_test_folder(TEST_FOLDER)
 
 
-def test_missing_description_in_readme_file():
-    model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+def test_missing_expected_description_in_readme_file():
+    model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
     clean_test_folder(TEST_FOLDER)
     test_model10 = SentenceTransformerModel(
         folder_path=TEST_FOLDER,
         model_id=model_id,
     )
@@ -441,8 +443,10 @@
     ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
 
     assert (
-        "description" not in model_config_data_torch
-    ), "Should not have description in model config file"
+        "description" in model_config_data_torch
+        and model_config_data_torch["description"]
+        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space."
+    ), "Should use default model description when description is missing from README.md"
 
     clean_test_folder(TEST_FOLDER)
 
@@ -475,17 +479,46 @@ def test_overwrite_description():
     clean_test_folder(TEST_FOLDER)
 
 
+def test_long_description():
+    model_id = "sentence-transformers/gtr-t5-base"
+    clean_test_folder(TEST_FOLDER)
+    test_model12 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    model_config_path_torch = test_model12.make_model_config_json(
+        model_format="TORCH_SCRIPT"
+    )
+    try:
+        with open(model_config_path_torch) as json_file:
+            model_config_data_torch = json.load(json_file)
+    except Exception as exec:
+        assert (
+            False
+        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+
+    assert (
+        "description" in model_config_data_torch
+        and model_config_data_torch["description"]
+        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of sematic search."
+    ), "Missing or Wrong model description in torch_script model config file"
+
+    clean_test_folder(TEST_FOLDER)
+
+
 def test_truncation_parameter():
     model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
     MAX_LENGTH_TASB = 512
     clean_test_folder(TEST_FOLDER)
-    test_model12 = SentenceTransformerModel(
+    test_model13 = SentenceTransformerModel(
         folder_path=TEST_FOLDER,
         model_id=model_id,
     )
 
-    test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"])
     tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
     try: