Skip to content

Commit

Permalink
Merge pull request #176 from thanawan-atc/add_description_config_json
Browse files Browse the repository at this point in the history
Add description config json
  • Loading branch information
thanawan-atc authored Aug 9, 2023
2 parents 2012aab + f5f7552 commit 3b20dac
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 18 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
# CHANGELOG
Inspired by [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

## [1.2.0]

### Added

### Changed

### Fixed
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))

## [1.1.0]

### Added
Expand Down Expand Up @@ -33,7 +42,6 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Make make_model_config_json function more concise by @thanawan-atc in ([#191](https://github.com/opensearch-project/opensearch-py-ml/pull/191))
- Enabled auto-truncation for any pretrained models by @Yerzhaisang in ([#192](https://github.com/opensearch-project/opensearch-py-ml/pull/192))
- Generalize make_model_config_json function by @thanawan-atc in ([#200](https://github.com/opensearch-project/opensearch-py-ml/pull/200))
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))

## [1.0.0]

Expand Down
60 changes: 52 additions & 8 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1008,34 +1008,72 @@ def set_up_accelerate_config(
"Failed to open config file for ml common upload: " + file_path + "\n"
)

def _get_model_description_from_readme_file(self, readme_file_path) -> str:
    """
    Get description of the model from README.md file in the model folder
    after the model is saved in local directory

    See example here:
    https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b/blob/main/README.md

    This function assumes that the README.md has the following format:

    # sentence-transformers/msmarco-distilbert-base-tas-b
    This is [ ... further description ... ]

    # [ ... Next section ...]
    ...

    Only the first line that follows the title heading is kept; text in
    parentheses (hyperlink targets), square brackets and asterisks are
    stripped from it.

    :param readme_file_path: Path to README.md file
    :type readme_file_path: string
    :return: Description of the model
    :rtype: string
    :raises AssertionError: if the title heading, or the heading that ends
        the description section, cannot be found in the README.md file
    """
    readme_data = MarkDownFile.read_file(readme_file_path)

    # Find the description section: first look for a heading carrying the
    # full model id ("# org/name") ...
    start_str = f"# {self.model_id}"
    start = readme_data.find(start_str)
    if start == -1:
        # ... then fall back to a heading carrying only the model name.
        # Taking the last path segment (instead of index 1) keeps an id
        # without "/" from raising IndexError here.
        model_name = self.model_id.split("/")[-1]
        start_str = f"# {model_name}"
        start = readme_data.find(start_str)

    # The description section ends where the next heading line begins.
    end = -1
    if start != -1:
        end = readme_data.find("\n#", start + len(start_str))

    # If we cannot find the scope of description section, raise error.
    # An explicit raise is used because an `assert` statement is removed
    # when Python runs with -O, which would let the slicing below run with
    # invalid offsets.
    if start == -1 or end == -1:
        raise AssertionError("Cannot find description in README.md file")

    # Parse out the description section and keep its first line only
    description = readme_data[start + len(start_str) + 1 : end].strip()
    description = description.split("\n")[0]

    # Remove hyperlink and reformat text
    description = re.sub(r"\(.*?\)", "", description)
    description = re.sub(r"[\[\]]", "", description)
    description = re.sub(r"\*", "", description)

    # Remove unnecessary part if exists (i.e. " For an introduction to ...")
    # (Found in https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/README.md)
    unnecessary_part = description.find(" For an introduction to")
    if unnecessary_part != -1:
        description = description[:unnecessary_part]

    return description

def _generate_default_model_description(self, embedding_dimension) -> str:
    """
    Generate default model description of the model based on embedding_dimension

    A notice is printed to tell the user that the default description is
    being used and how to override it.

    :param embedding_dimension: Embedding dimension of the model.
    :type embedding_dimension: int
    :return: Description of the model
    :rtype: string
    """
    # Let the user know the description was not taken from README.md.
    print(
        "Using default description from embedding_dimension instead (You can overwrite this by specifying description parameter in make_model_config_json function)"
    )
    description = f"This is a sentence-transformers model: It maps sentences & paragraphs to a {embedding_dimension} dimensional dense vector space."
    return description

def make_model_config_json(
Expand Down Expand Up @@ -1129,11 +1167,19 @@ def make_model_config_json(
try:
if verbose:
print("reading README.md file")
description = self.get_model_description_from_md_file(
description = self._get_model_description_from_readme_file(
readme_file_path
)
except Exception as e:
print(f"Cannot get model description from README.md file: {e}")
print(f"Cannot scrape model description from README.md file: {e}")
description = self._generate_default_model_description(
embedding_dimension
)
else:
print("Cannot find README.md file to scrape model description")
description = self._generate_default_model_description(
embedding_dimension
)

if all_config is None:
if not os.path.exists(config_json_file_path):
Expand Down Expand Up @@ -1162,6 +1208,7 @@ def make_model_config_json(
model_config_content = {
"name": model_name,
"version": version_number,
"description": description,
"model_format": model_format,
"model_task_type": "TEXT_EMBEDDING",
"model_config": {
Expand All @@ -1174,9 +1221,6 @@ def make_model_config_json(
},
}

if description is not None:
model_config_content["description"] = description

if verbose:
print("generating ml-commons_model_config.json file...\n")
print(model_config_content)
Expand Down
51 changes: 42 additions & 9 deletions tests/ml_models/test_sentencetransformermodel_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def test_make_model_config_json_for_onnx():
"description" in model_config_data_onnx
and model_config_data_onnx["description"]
== "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
), "Missing or Wrong model description in onnx model config file'"
), "Missing or Wrong model description in onnx model config file"
assert (
"model_config" in model_config_data_onnx
), "Missing 'model_config' in onnx model config file"
Expand Down Expand Up @@ -408,14 +408,16 @@ def test_missing_readme_md_file():
), f"Creating model config file for tracing in torch_script raised an exception {exec}"

assert (
"description" not in model_config_data_torch
), "Should not have description in model config file"
"description" in model_config_data_torch
and model_config_data_torch["description"]
== "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space."
), "Should use default model description when README.md file is missing"

clean_test_folder(TEST_FOLDER)


def test_missing_description_in_readme_file():
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
def test_missing_expected_description_in_readme_file():
model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
clean_test_folder(TEST_FOLDER)
test_model10 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
Expand All @@ -441,8 +443,10 @@ def test_missing_description_in_readme_file():
), f"Creating model config file for tracing in torch_script raised an exception {exec}"

assert (
"description" not in model_config_data_torch
), "Should not have description in model config file"
"description" in model_config_data_torch
and model_config_data_torch["description"]
== "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space."
), "Should use default model description when description is missing from README.md"

clean_test_folder(TEST_FOLDER)

Expand Down Expand Up @@ -475,17 +479,46 @@ def test_overwrite_description():
clean_test_folder(TEST_FOLDER)


def test_long_description():
    """
    Check the description written to the model config file for a model whose
    README.md description line has more than one sentence.

    The model is saved as TorchScript, the config file is generated without
    an explicit description, and the "description" entry of the resulting
    JSON is compared against the expected two-sentence text.
    """
    model_id = "sentence-transformers/gtr-t5-base"
    expected_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of sematic search."

    clean_test_folder(TEST_FOLDER)
    model = SentenceTransformerModel(folder_path=TEST_FOLDER, model_id=model_id)
    model.save_as_pt(model_id=model_id, sentences=["today is sunny"])
    config_path = model.make_model_config_json(model_format="TORCH_SCRIPT")

    # Any failure while opening or parsing the generated file fails the test.
    try:
        with open(config_path) as config_file:
            config_data = json.load(config_file)
    except Exception as error:
        assert (
            False
        ), f"Creating model config file for tracing in torch_script raised an exception {error}"

    # .get() yields None for a missing key, which never equals the expected
    # string, so this covers both "missing" and "wrong".
    actual_description = config_data.get("description")
    assert (
        actual_description == expected_description
    ), "Missing or Wrong model description in torch_script model config file"

    clean_test_folder(TEST_FOLDER)


def test_truncation_parameter():
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
MAX_LENGTH_TASB = 512

clean_test_folder(TEST_FOLDER)
test_model12 = SentenceTransformerModel(
test_model13 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"])

tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
try:
Expand Down

0 comments on commit 3b20dac

Please sign in to comment.