Skip to content

Commit

Permalink
Make make_model_config_json function more concise (#191)
Browse files Browse the repository at this point in the history
* Edit make_model_config_json function

Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>

* Improve cov

Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>

* Improve cov

Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>

* Update CHANGELOG.md

Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>

* Improve cov (2)

Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>

* Correct linting

Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>

---------

Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>
  • Loading branch information
thanawan-atc authored Jul 18, 2023
1 parent 8784cc4 commit 7622af5
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 187 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
### Fixed
- Fix ModelUploader bug & Update model tracing demo notebook by @thanawan-atc in ([#185](https://github.com/opensearch-project/opensearch-py-ml/pull/185))
- Fix make_model_config_json function by @thanawan-atc in ([#188](https://github.com/opensearch-project/opensearch-py-ml/pull/188))
- Make make_model_config_json function more concise by @thanawan-atc in ([#191](https://github.com/opensearch-project/opensearch-py-ml/pull/191))

## [1.0.0]

Expand Down Expand Up @@ -82,4 +83,4 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)


[1.0.0]: https://github.com/opensearch-project/opensearch-py-ml/compare/1.0.0b1...1.0.0
[1.0.0b1]: https://github.com/opensearch-project/opensearch-py-ml/commits/1.0.0b1
[1.0.0b1]: https://github.com/opensearch-project/opensearch-py-ml/commits/1.0.0b1
151 changes: 49 additions & 102 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import yaml
from accelerate import Accelerator, notebook_launcher
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Normalize, Pooling, Transformer
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import TrainingArguments, get_linear_schedule_with_warmup
Expand Down Expand Up @@ -244,7 +245,7 @@ def read_queries(self, read_path: str, overwrite: bool = False) -> pd.DataFrame:
within synthetic_queries/ folder, output as a dataframe
:param read_path:
required, path to the zipped file that contains generated queries, if None, raise exception
required, path to the zipped file that contains generated queries
:type read_path: string
:param overwrite:
optional, synthetic_queries/ folder in current directory is to store unzip queries files.
Expand All @@ -254,12 +255,6 @@ def read_queries(self, read_path: str, overwrite: bool = False) -> pd.DataFrame:
:return: The dataframe of queries.
:rtype: panda dataframe
"""

if read_path is None:
raise Exception(
"No file provided. Please provide the path to synthetic query zip file."
)

# assign a local folder 'synthetic_queries/' to store the unzip file,
# check if the folder contains sub-folders and files, remove and clean up the folder before unzip.
# walk through the zip file and read the file paths into file_list
Expand Down Expand Up @@ -989,25 +984,24 @@ def make_model_config_json(
"""
parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. If all required
fields are given by users, use the given parameters and will skip reading the config.json
:param model_name:
Optional, The name of the model. If None, default to parse from model id, for example,
'msmarco-distilbert-base-tas-b'
Optional, The name of the model. If None, default is model id, for example,
'sentence-transformers/msmarco-distilbert-base-tas-b'
:type model_name: string
:param model_format:
Optional, The format of the model. Default is "TORCH_SCRIPT".
:type model_format: string
:param version_number:
Optional, The version number of the model. Default is 1
:type version_number: string
:param embedding_dimension: Optional, the embedding dimension of the model. If None, parse embedding_dimension
from the config file of pre-trained hugging-face model. If not found, default to be 768
:param embedding_dimension: Optional, the embedding dimension of the model. If None, get embedding_dimension
from the pre-trained hugging-face model object.
:type embedding_dimension: int
:param pooling_mode: Optional, the pooling mode of the model. If None, parse pooling_mode
from the config file of pre-trained hugging-face model. If not found, do not include it.
:param pooling_mode: Optional, the pooling mode of the model. If None, get pooling_mode
from the pre-trained hugging-face model object.
:type pooling_mode: string
:param normalize_result: Optional, whether to normalize the result of the model. If None, check if 2_Normalize folder
exists in the pre-trained hugging-face model folder. If not found, do not include it.
:param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained
hugging-face model object.
:type normalize_result: bool
:param all_config:
Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained
Expand All @@ -1028,8 +1022,43 @@ def make_model_config_json(
if model_name is None:
model_name = self.model_id

# if user input model_type and embedding_dimension, it will skip reading the config.json file
if model_type is None or embedding_dimension is None:
# if user input model_type/embedding_dimension/pooling_mode, it will skip this step.
model = SentenceTransformer(self.model_id)
if (
model_type is None
or embedding_dimension is None
or pooling_mode is None
or normalize_result is None
):
try:
if (
model_type is None
and len(model._modules) >= 1
and isinstance(model._modules["0"], Transformer)
):
model_type = model._modules["0"].auto_model.__class__.__name__
model_type = model_type.lower().rstrip("model")
if embedding_dimension is None:
embedding_dimension = model.get_sentence_embedding_dimension()
if (
pooling_mode is None
and len(model._modules) >= 2
and isinstance(model._modules["1"], Pooling)
):
pooling_mode = model._modules["1"].get_pooling_mode_str().upper()
if normalize_result is None:
if len(model._modules) >= 3 and isinstance(
model._modules["2"], Normalize
):
normalize_result = True
else:
normalize_result = False
except Exception as e:
raise Exception(
f"Raised exception while getting model data from pre-trained hugging-face model object: {e}"
)

if all_config is None:
if not os.path.exists(config_json_file_path):
raise Exception(
str(
Expand All @@ -1045,37 +1074,6 @@ def make_model_config_json(
config_content = json.load(f)
if all_config is None:
all_config = config_content
if model_type is None:
if "model_type" in config_content.keys():
model_type = config_content["model_type"]
else:
print(
"Please check file or input model_type and embedding_dimension in the argument"
)
raise Exception(
str(
"Cannot find model_type in config.json file"
+ config_json_file_path
+ ". Please check the config.son file in the path."
)
)
if embedding_dimension is None:
embedding_dimension_mapping_list = [
"dim",
"hidden_size",
"d_model",
]
for mapping_item in embedding_dimension_mapping_list:
if mapping_item in config_content.keys():
embedding_dimension = config_content[mapping_item]
break
else:
print(
'Cannot find "dim" or "hidden_size" or "d_model" in config.json file at ',
config_json_file_path,
". Please add in the config file or input in the argument for embedding_dimension.",
)
embedding_dimension = 768
except IOError:
print(
"Cannot open in config.json file at ",
Expand All @@ -1093,63 +1091,12 @@ def make_model_config_json(
"model_type": model_type,
"embedding_dimension": embedding_dimension,
"framework_type": "sentence_transformers",
"pooling_mode": pooling_mode,
"normalize_result": normalize_result,
"all_config": json.dumps(all_config),
},
}

if pooling_mode is not None:
model_config_content["model_config"]["pooling_mode"] = pooling_mode
else:
pooling_config_json_file_path = os.path.join(
folder_path, "1_Pooling", "config.json"
)
if os.path.exists(pooling_config_json_file_path):
try:
with open(pooling_config_json_file_path) as f:
if verbose:
print(
"reading pooling config file from: "
+ pooling_config_json_file_path
)
pooling_config_content = json.load(f)
pooling_mode_mapping_dict = {
"pooling_mode_cls_token": "CLS",
"pooling_mode_mean_tokens": "MEAN",
"pooling_mode_max_tokens": "MAX",
"pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN",
}
for mapping_item in pooling_mode_mapping_dict:
if (
mapping_item in pooling_config_content.keys()
and pooling_config_content[mapping_item]
):
pooling_mode = pooling_mode_mapping_dict[mapping_item]
model_config_content["model_config"][
"pooling_mode"
] = pooling_mode
break
else:
print(
'Cannot find "pooling_mode_[mode]_token(s)" with value true in config.json file at ',
pooling_config_json_file_path,
". Please add in the pooling config file or input in the argument for pooling_mode.",
)

except IOError:
print(
"Cannot open in config.json file at ",
pooling_config_json_file_path,
". Please check the config.json ",
"file in the path.",
)

if normalize_result is not None:
model_config_content["model_config"]["normalize_result"] = normalize_result
else:
normalize_result_json_file_path = os.path.join(folder_path, "2_Normalize")
if os.path.exists(normalize_result_json_file_path):
model_config_content["model_config"]["normalize_result"] = True

if verbose:
print("generating ml-commons_model_config.json file...\n")
print(model_config_content)
Expand Down
1 change: 1 addition & 0 deletions tests/ml_commons/test_ml_commons_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ def test_integration_model_train_register_full_cycle():
zip_file_name=MODEL_FILE_ZIP_NAME,
num_epochs=1,
overwrite=True,
verbose=True,
)
# second generating the config file to create metadoc of the model in opensearch.
test_model.make_model_config_json()
Expand Down
88 changes: 4 additions & 84 deletions tests/ml_models/test_sentencetransformermodel_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_missing_files():
with pytest.raises(FileNotFoundError):
test_model.read_queries(read_path="1234")

# test synthetic queries already exists in folder
# test synthetic queries already exists in folder
with pytest.raises(Exception) as exc_info:
temp_path = os.path.join(
os.path.dirname(os.path.abspath("__file__")),
Expand All @@ -123,7 +123,7 @@ def test_missing_files():

# test no tokenizer.json file
with pytest.raises(Exception) as exc_info:
test_model.zip_model()
test_model.zip_model(verbose=True)
assert "Cannot find tokenizer.json file" in str(exc_info.value)

# test no model file
Expand All @@ -137,7 +137,7 @@ def test_missing_files():
test_model3 = SentenceTransformerModel(folder_path=temp_path)
test_model3.save_as_pt(sentences=["today is sunny"])
os.remove(os.path.join(temp_path, "msmarco-distilbert-base-tas-b.pt"))
test_model3.zip_model()
test_model3.zip_model(verbose=True)
clean_test_folder(temp_path)
assert "Cannot find model in the model path" in str(exc_info.value)

Expand Down Expand Up @@ -187,7 +187,7 @@ def test_make_model_config_json_for_torch_script():

test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"])
model_config_path_torch = test_model5.make_model_config_json(
model_format="TORCH_SCRIPT"
model_format="TORCH_SCRIPT", verbose=True
)
try:
with open(model_config_path_torch) as json_file:
Expand Down Expand Up @@ -372,85 +372,5 @@ def test_overwrite_fields_in_model_config():
clean_test_folder(TEST_FOLDER)


def test_missing_fields_in_config_json():
model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
expected_model_config_data = {
"embedding_dimension": 768,
"normalize_result": False,
}

clean_test_folder(TEST_FOLDER)
test_model9 = SentenceTransformerModel(
folder_path=TEST_FOLDER,
model_id=model_id,
)

test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"])

pooling_json_file_path = os.path.join(TEST_FOLDER, "1_Pooling", "config.json")
try:
with open(pooling_json_file_path, "w") as f:
empty_dict = {}
json.dump(empty_dict, f)
except Exception as exec:
assert False, f"Modifying pooling json file raised an exception {exec}"

config_json_file_path = os.path.join(TEST_FOLDER, "config.json")
try:
with open(config_json_file_path, "r") as f:
config_content = json.load(f)
embedding_dimension_mapping_list = [
"dim",
"hidden_size",
"d_model",
]
for mapping_item in embedding_dimension_mapping_list:
config_content.pop(mapping_item, None)

with open(config_json_file_path, "w") as f:
json.dump(config_content, f)
except Exception as exec:
assert False, f"Modifying config json file raised an exception {exec}"

model_config_path_torch = test_model9.make_model_config_json(
model_format="TORCH_SCRIPT", verbose=True
)
try:
with open(model_config_path_torch) as json_file:
model_config_data_torch = json.load(json_file)
except Exception as exec:
assert (
False
), f"Creating model config file for tracing in torch_script raised an exception {exec}"

assert (
"name" in model_config_data_torch
and model_config_data_torch["name"] == model_id
), "Missing or Wrong model name in torch script model config file"
assert (
"model_format" in model_config_data_torch
and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
)
assert (
"model_config" in model_config_data_torch
), "Missing 'model_config' in torch script model config file"

for k, v in expected_model_config_data.items():
assert (
k in model_config_data_torch["model_config"]
and model_config_data_torch["model_config"][k] == v
) or (
k not in model_config_data_torch["model_config"]
and k == "normalize_result"
and not v
), "make_model_config_json() does not generate an expected model config"

assert (
"pooling_mode" not in model_config_data_torch
), "make_model_config_json() does not generate an expected model config"

clean_test_folder(TEST_FOLDER)


clean_test_folder(TEST_FOLDER)
clean_test_folder(TESTDATA_UNZIP_FOLDER)

0 comments on commit 7622af5

Please sign in to comment.