diff --git a/CHANGELOG.md b/CHANGELOG.md index f626ccc3..ccd3e535 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Fixed - Fix ModelUploader bug & Update model tracing demo notebook by @thanawan-atc in ([#185](https://github.com/opensearch-project/opensearch-py-ml/pull/185)) - Fix make_model_config_json function by @thanawan-atc in ([#188](https://github.com/opensearch-project/opensearch-py-ml/pull/188)) +- Make make_model_config_json function more concise by @thanawan-atc in ([#191](https://github.com/opensearch-project/opensearch-py-ml/pull/191)) ## [1.0.0] @@ -82,4 +83,4 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) [1.0.0]: https://github.com/opensearch-project/opensearch-py-ml/compare/1.0.0b1...1.0.0 -[1.0.0b1]: https://github.com/opensearch-project/opensearch-py-ml/commits/1.0.0b1 \ No newline at end of file +[1.0.0b1]: https://github.com/opensearch-project/opensearch-py-ml/commits/1.0.0b1 diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index e22b574f..057cfc4e 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -24,6 +24,7 @@ import yaml from accelerate import Accelerator, notebook_launcher from sentence_transformers import SentenceTransformer +from sentence_transformers.models import Normalize, Pooling, Transformer from torch.utils.data import DataLoader from tqdm import tqdm from transformers import TrainingArguments, get_linear_schedule_with_warmup @@ -244,7 +245,7 @@ def read_queries(self, read_path: str, overwrite: bool = False) -> pd.DataFrame: within synthetic_queries/ folder, output as a dataframe :param read_path: - required, path to the zipped file that contains generated queries, if None, raise exception + required, path to the zipped file that contains generated queries :type 
read_path: string :param overwrite: optional, synthetic_queries/ folder in current directory is to store unzip queries files. @@ -254,12 +255,6 @@ def read_queries(self, read_path: str, overwrite: bool = False) -> pd.DataFrame: :return: The dataframe of queries. :rtype: panda dataframe """ - - if read_path is None: - raise Exception( - "No file provided. Please provide the path to synthetic query zip file." - ) - # assign a local folder 'synthetic_queries/' to store the unzip file, # check if the folder contains sub-folders and files, remove and clean up the folder before unzip. # walk through the zip file and read the file paths into file_list @@ -989,10 +984,9 @@ def make_model_config_json( """ parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. If all required fields are given by users, use the given parameters and will skip reading the config.json - :param model_name: - Optional, The name of the model. If None, default to parse from model id, for example, - 'msmarco-distilbert-base-tas-b' + Optional, The name of the model. If None, default is model id, for example, + 'sentence-transformers/msmarco-distilbert-base-tas-b' :type model_name: string :param model_format: Optional, The format of the model. Default is "TORCH_SCRIPT". @@ -1000,14 +994,14 @@ def make_model_config_json( :param version_number: Optional, The version number of the model. Default is 1 :type version_number: string - :param embedding_dimension: Optional, the embedding dimension of the model. If None, parse embedding_dimension - from the config file of pre-trained hugging-face model. If not found, default to be 768 + :param embedding_dimension: Optional, the embedding dimension of the model. If None, get embedding_dimension + from the pre-trained hugging-face model object. :type embedding_dimension: int - :param pooling_mode: Optional, the pooling mode of the model. 
If None, parse pooling_mode - from the config file of pre-trained hugging-face model. If not found, do not include it. + :param pooling_mode: Optional, the pooling mode of the model. If None, get pooling_mode + from the pre-trained hugging-face model object. :type pooling_mode: string - :param normalize_result: Optional, whether to normalize the result of the model. If None, check if 2_Normalize folder - exists in the pre-trained hugging-face model folder. If not found, do not include it. + :param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained + hugging-face model object. :type normalize_result: bool :param all_config: Optional, the all_config of the model. If None, parse all contents from the config file of pre-trained @@ -1028,8 +1022,43 @@ def make_model_config_json( if model_name is None: model_name = self.model_id - # if user input model_type and embedding_dimension, it will skip reading the config.json file - if model_type is None or embedding_dimension is None: + # if user input model_type/embedding_dimension/pooling_mode, it will skip this step. 
+ if ( + model_type is None + or embedding_dimension is None + or pooling_mode is None + or normalize_result is None + ): + model = SentenceTransformer(self.model_id) + try: + if ( + model_type is None + and len(model._modules) >= 1 + and isinstance(model._modules["0"], Transformer) + ): + model_type = model._modules["0"].auto_model.__class__.__name__ + model_type = model_type.lower() + # rstrip("model") strips characters, not the suffix + if model_type.endswith("model"): + model_type = model_type[: -len("model")] + if embedding_dimension is None: + embedding_dimension = model.get_sentence_embedding_dimension() + if ( + pooling_mode is None + and len(model._modules) >= 2 + and isinstance(model._modules["1"], Pooling) + ): + pooling_mode = model._modules["1"].get_pooling_mode_str().upper() + if normalize_result is None: + normalize_result = len(model._modules) >= 3 and isinstance( + model._modules["2"], Normalize + ) + except Exception as e: + raise Exception( + f"Raised exception while getting model data from pre-trained hugging-face model object: {e}" + ) from e + + if all_config is None: if not os.path.exists(config_json_file_path): raise Exception( str( @@ -1045,37 +1074,6 @@ def make_model_config_json( config_content = json.load(f) if all_config is None: all_config = config_content - if model_type is None: - if "model_type" in config_content.keys(): - model_type = config_content["model_type"] - else: - print( - "Please check file or input model_type and embedding_dimension in the argument" - ) - raise Exception( - str( - "Cannot find model_type in config.json file" - + config_json_file_path - + ". Please check the config.son file in the path." 
- ) - ) - if embedding_dimension is None: - embedding_dimension_mapping_list = [ - "dim", - "hidden_size", - "d_model", - ] - for mapping_item in embedding_dimension_mapping_list: - if mapping_item in config_content.keys(): - embedding_dimension = config_content[mapping_item] - break - else: - print( - 'Cannot find "dim" or "hidden_size" or "d_model" in config.json file at ', - config_json_file_path, - ". Please add in the config file or input in the argument for embedding_dimension.", - ) - embedding_dimension = 768 except IOError: print( "Cannot open in config.json file at ", @@ -1093,63 +1091,12 @@ def make_model_config_json( "model_type": model_type, "embedding_dimension": embedding_dimension, "framework_type": "sentence_transformers", + "pooling_mode": pooling_mode, + "normalize_result": normalize_result, "all_config": json.dumps(all_config), }, } - if pooling_mode is not None: - model_config_content["model_config"]["pooling_mode"] = pooling_mode - else: - pooling_config_json_file_path = os.path.join( - folder_path, "1_Pooling", "config.json" - ) - if os.path.exists(pooling_config_json_file_path): - try: - with open(pooling_config_json_file_path) as f: - if verbose: - print( - "reading pooling config file from: " - + pooling_config_json_file_path - ) - pooling_config_content = json.load(f) - pooling_mode_mapping_dict = { - "pooling_mode_cls_token": "CLS", - "pooling_mode_mean_tokens": "MEAN", - "pooling_mode_max_tokens": "MAX", - "pooling_mode_mean_sqrt_len_tokens": "MEAN_SQRT_LEN", - } - for mapping_item in pooling_mode_mapping_dict: - if ( - mapping_item in pooling_config_content.keys() - and pooling_config_content[mapping_item] - ): - pooling_mode = pooling_mode_mapping_dict[mapping_item] - model_config_content["model_config"][ - "pooling_mode" - ] = pooling_mode - break - else: - print( - 'Cannot find "pooling_mode_[mode]_token(s)" with value true in config.json file at ', - pooling_config_json_file_path, - ". 
Please add in the pooling config file or input in the argument for pooling_mode.", - ) - - except IOError: - print( - "Cannot open in config.json file at ", - pooling_config_json_file_path, - ". Please check the config.json ", - "file in the path.", - ) - - if normalize_result is not None: - model_config_content["model_config"]["normalize_result"] = normalize_result - else: - normalize_result_json_file_path = os.path.join(folder_path, "2_Normalize") - if os.path.exists(normalize_result_json_file_path): - model_config_content["model_config"]["normalize_result"] = True - if verbose: print("generating ml-commons_model_config.json file...\n") print(model_config_content) diff --git a/tests/ml_commons/test_ml_commons_client.py b/tests/ml_commons/test_ml_commons_client.py index adad7bb7..73868ba4 100644 --- a/tests/ml_commons/test_ml_commons_client.py +++ b/tests/ml_commons/test_ml_commons_client.py @@ -303,6 +303,7 @@ def test_integration_model_train_register_full_cycle(): zip_file_name=MODEL_FILE_ZIP_NAME, num_epochs=1, overwrite=True, + verbose=True, ) # second generating the config file to create metadoc of the model in opensearch. 
test_model.make_model_config_json() diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index 37beaa3e..ae16681f 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -107,7 +107,7 @@ def test_missing_files(): with pytest.raises(FileNotFoundError): test_model.read_queries(read_path="1234") - # test synthetic queries already exists in folder + # test synthetic queries already exists in folder with pytest.raises(Exception) as exc_info: temp_path = os.path.join( os.path.dirname(os.path.abspath("__file__")), @@ -123,7 +123,7 @@ def test_missing_files(): # test no tokenizer.json file with pytest.raises(Exception) as exc_info: - test_model.zip_model() + test_model.zip_model(verbose=True) assert "Cannot find tokenizer.json file" in str(exc_info.value) # test no model file @@ -137,7 +137,7 @@ def test_missing_files(): test_model3 = SentenceTransformerModel(folder_path=temp_path) test_model3.save_as_pt(sentences=["today is sunny"]) os.remove(os.path.join(temp_path, "msmarco-distilbert-base-tas-b.pt")) - test_model3.zip_model() + test_model3.zip_model(verbose=True) clean_test_folder(temp_path) assert "Cannot find model in the model path" in str(exc_info.value) @@ -187,7 +187,7 @@ def test_make_model_config_json_for_torch_script(): test_model5.save_as_pt(model_id=model_id, sentences=["today is sunny"]) model_config_path_torch = test_model5.make_model_config_json( - model_format="TORCH_SCRIPT" + model_format="TORCH_SCRIPT", verbose=True ) try: with open(model_config_path_torch) as json_file: @@ -372,85 +372,5 @@ def test_overwrite_fields_in_model_config(): clean_test_folder(TEST_FOLDER) -def test_missing_fields_in_config_json(): - model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" - expected_model_config_data = { - "embedding_dimension": 768, - "normalize_result": False, - } - - clean_test_folder(TEST_FOLDER) - 
test_model9 = SentenceTransformerModel( - folder_path=TEST_FOLDER, - model_id=model_id, - ) - - test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"]) - - pooling_json_file_path = os.path.join(TEST_FOLDER, "1_Pooling", "config.json") - try: - with open(pooling_json_file_path, "w") as f: - empty_dict = {} - json.dump(empty_dict, f) - except Exception as exec: - assert False, f"Modifying pooling json file raised an exception {exec}" - - config_json_file_path = os.path.join(TEST_FOLDER, "config.json") - try: - with open(config_json_file_path, "r") as f: - config_content = json.load(f) - embedding_dimension_mapping_list = [ - "dim", - "hidden_size", - "d_model", - ] - for mapping_item in embedding_dimension_mapping_list: - config_content.pop(mapping_item, None) - - with open(config_json_file_path, "w") as f: - json.dump(config_content, f) - except Exception as exec: - assert False, f"Modifying config json file raised an exception {exec}" - - model_config_path_torch = test_model9.make_model_config_json( - model_format="TORCH_SCRIPT", verbose=True - ) - try: - with open(model_config_path_torch) as json_file: - model_config_data_torch = json.load(json_file) - except Exception as exec: - assert ( - False - ), f"Creating model config file for tracing in torch_script raised an exception {exec}" - - assert ( - "name" in model_config_data_torch - and model_config_data_torch["name"] == model_id - ), "Missing or Wrong model name in torch script model config file" - assert ( - "model_format" in model_config_data_torch - and model_config_data_torch["model_format"] == "TORCH_SCRIPT" - ) - assert ( - "model_config" in model_config_data_torch - ), "Missing 'model_config' in torch script model config file" - - for k, v in expected_model_config_data.items(): - assert ( - k in model_config_data_torch["model_config"] - and model_config_data_torch["model_config"][k] == v - ) or ( - k not in model_config_data_torch["model_config"] - and k == "normalize_result" - and not v 
- ), "make_model_config_json() does not generate an expected model config" - - assert ( - "pooling_mode" not in model_config_data_torch - ), "make_model_config_json() does not generate an expected model config" - - clean_test_folder(TEST_FOLDER) - - clean_test_folder(TEST_FOLDER) clean_test_folder(TESTDATA_UNZIP_FOLDER)