diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml
index be55b762..ce4bb36e 100644
--- a/.github/workflows/model_uploader.yml
+++ b/.github/workflows/model_uploader.yml
@@ -42,6 +42,14 @@ on:
         description: "(Optional) Description (Specify here if you want to overwrite the default model description)"
         required: false
         type: string
+      allow_overwrite:
+        description: "Allow the workflow to overwrite the model in the model hub"
+        required: true
+        type: choice
+        options:
+          - "NO"
+          - "YES"
+
 
 jobs:
   # Step 2: Initiate workflow variable
@@ -71,6 +79,7 @@ jobs:
           - Workflow Name: ${{ github.workflow }}
           - Workflow Run ID: ${{ github.run_id }}
           - Workflow Initiator: @${{ github.actor }}
+          - Allow Overwrite: ${{ github.event.inputs.allow_overwrite }}
 
           ========= Workflow Input Information =========
           - Model ID: ${{ github.event.inputs.model_id }}
@@ -102,6 +111,7 @@ jobs:
   # Step 3: Check if the model already exists in the model hub
   checking-out-model-hub:
     needs: init-workflow-var
+    if: github.event.inputs.allow_overwrite == 'NO'
     runs-on: 'ubuntu-latest'
     permissions:
       id-token: write
@@ -148,6 +158,7 @@ jobs:
   # Step 4: Trace the model, Verify the embeddings & Upload the model files as artifacts
   model-auto-tracing:
     needs: [init-workflow-var, checking-out-model-hub]
+    if: always() && needs.init-workflow-var.result == 'success' && (needs.checking-out-model-hub.result == 'success' || needs.checking-out-model-hub.result == 'skipped')
     name: model-auto-tracing
     runs-on: ubuntu-latest
     permissions:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60ce037b..1a0858ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Update pretrained_models_all_versions.json (2023-09-08 13:14:07) by @dhrubo-os ([#277](https://github.com/opensearch-project/opensearch-py-ml/pull/277))
 - Update model upload history - sentence-transformers/distiluse-base-multilingual-cased-v1 (v.1.0.1)(TORCH_SCRIPT) by @dhrubo-os ([#281](https://github.com/opensearch-project/opensearch-py-ml/pull/281))
 - Update pretrained_models_all_versions.json (2023-09-14 10:28:41) by @dhrubo-os ([#282](https://github.com/opensearch-project/opensearch-py-ml/pull/282))
+- Enable the model upload workflow to add model_content_size_in_bytes & model_content_hash_value to model config automatically by @thanawan-atc ([#291](https://github.com/opensearch-project/opensearch-py-ml/pull/291))
 
 ### Fixed
 - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
diff --git a/opensearch_py_ml/ml_commons/ml_common_utils.py b/opensearch_py_ml/ml_commons/ml_common_utils.py
index b38e59b7..8ca5bab2 100644
--- a/opensearch_py_ml/ml_commons/ml_common_utils.py
+++ b/opensearch_py_ml/ml_commons/ml_common_utils.py
@@ -5,6 +5,8 @@
 # Any modifications Copyright OpenSearch Contributors. See
 # GitHub history for details.
 
+import hashlib
+
 ML_BASE_URI = "/_plugins/_ml"
 MODEL_CHUNK_MAX_SIZE = 10_000_000
 MODEL_MAX_SIZE = 4_000_000_000
@@ -22,3 +24,31 @@
 FRAMEWORK_TYPE = "framework_type"
 MODEL_CONTENT_HASH_VALUE = "model_content_hash_value"
 MODEL_GROUP_ID = "model_group_id"
+
+
+def _generate_model_content_hash_value(model_file_path: str) -> str:
+    """
+    Generate sha256 hash value for the model zip file.
+
+    Parameters
+    ----------
+    :param model_file_path: file path of the model file
+    :type model_file_path: string
+
+    Returns
+    -------
+    :return: sha256 hash
+    :rtype: string
+
+    """
+
+    sha256 = hashlib.sha256()
+    with open(model_file_path, "rb") as file:
+        while True:
+            chunk = file.read(BUF_SIZE)
+            if not chunk:
+                break
+            sha256.update(chunk)
+    sha256_value = sha256.hexdigest()
+    return sha256_value
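The new module-level helper replaces the private `ModelUploader._generate_hash` method. A minimal usage sketch, assuming the package is installed and `model.zip` is a hypothetical path to a traced model zip:

```python
# A minimal sketch; "model.zip" is a hypothetical path to a traced model zip.
from opensearch_py_ml.ml_commons.ml_common_utils import (
    _generate_model_content_hash_value,
)

# Returns the hex-encoded sha256 digest of the file, read in BUF_SIZE chunks.
checksum = _generate_model_content_hash_value("model.zip")
print(checksum)  # 64 hex characters
```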
diff --git a/opensearch_py_ml/ml_commons/model_uploader.py b/opensearch_py_ml/ml_commons/model_uploader.py
index 4f02a0e1..af7af2f6 100644
--- a/opensearch_py_ml/ml_commons/model_uploader.py
+++ b/opensearch_py_ml/ml_commons/model_uploader.py
@@ -5,7 +5,6 @@
 # Any modifications Copyright OpenSearch Contributors. See
 # GitHub history for details.
 
-import hashlib
 import json
 import os
 from math import ceil
@@ -14,7 +13,6 @@
 from opensearchpy import OpenSearch
 
 from opensearch_py_ml.ml_commons.ml_common_utils import (
-    BUF_SIZE,
     EMBEDDING_DIMENSION,
     FRAMEWORK_TYPE,
     META_API_ENDPOINT,
@@ -30,6 +28,7 @@
     MODEL_TYPE,
     MODEL_VERSION_FIELD,
     TOTAL_CHUNKS_FIELD,
+    _generate_model_content_hash_value,
 )
 
 
@@ -85,12 +84,8 @@ def _register_model(
         model_content_size_in_bytes = os.stat(model_path).st_size
         total_num_chunks: int = ceil(model_content_size_in_bytes / MODEL_CHUNK_MAX_SIZE)
 
-        # we are generating the sha1 hash for the model zip file
-        hash_val_model_file = self._generate_hash(model_path)
-
         if isVerbose:
             print("Total number of chunks", total_num_chunks)
-            print("Sha1 value of the model file: ", hash_val_model_file)
 
         model_meta_json_file = open(model_meta_path)
 
@@ -98,8 +93,18 @@ def _register_model(
             model_meta_json_file
         )
         model_meta_json[TOTAL_CHUNKS_FIELD] = total_num_chunks
-        model_meta_json[MODEL_CONTENT_SIZE_IN_BYTES_FIELD] = model_content_size_in_bytes
-        model_meta_json[MODEL_CONTENT_HASH_VALUE] = hash_val_model_file
+
+        if MODEL_CONTENT_SIZE_IN_BYTES_FIELD not in model_meta_json:
+            model_meta_json[
+                MODEL_CONTENT_SIZE_IN_BYTES_FIELD
+            ] = model_content_size_in_bytes
+        if MODEL_CONTENT_HASH_VALUE not in model_meta_json:
+            # Generate the sha256 hash for the model zip file
+            hash_val_model_file = _generate_model_content_hash_value(model_path)
+            model_meta_json[MODEL_CONTENT_HASH_VALUE] = hash_val_model_file
+            if isVerbose:
+                print("Sha256 value of the model file: ", hash_val_model_file)
+
         model_meta_json[MODEL_GROUP_ID] = model_group_id
 
         if self._check_mandatory_field(model_meta_json):
@@ -189,30 +194,3 @@ def _check_mandatory_field(self, model_meta: dict) -> bool:
                 return True
         else:
             raise ValueError("Model metadata can't be empty")
-
-    def _generate_hash(self, model_file_path: str) -> str:
-        """
-        Generate sha1 hash value for the model zip file.
-
-        Parameters
-        ----------
-        :param model_file_path: file path of the model file
-        :type model_file_path: string
-
-        Returns
-        -------
-        :return: sha256 hash
-        :rtype: string
-
-        """
-
-        sha256 = hashlib.sha256()
-        with open(model_file_path, "rb") as file:
-            while True:
-                chunk = file.read(BUF_SIZE)
-                if not chunk:
-                    break
-                sha256.update(chunk)
-        sha256_value = sha256.hexdigest()
-        return sha256_value
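Note that `_register_model` now computes the size and hash only when the config does not already carry them, so values written earlier by `make_model_config_json` win. A hedged sketch of that fill-if-missing logic, with a hypothetical path and config dict:

```python
# Sketch of the fill-if-missing behavior in _register_model; the zip path
# and the partial config dict below are hypothetical.
import os

from opensearch_py_ml.ml_commons.ml_common_utils import (
    _generate_model_content_hash_value,
)

model_path = "all-MiniLM-L6-v2.zip"  # hypothetical model zip
model_meta = {"name": "all-MiniLM-L6-v2", "version": "1.0.0"}

# Pre-supplied values win; the uploader only computes what is absent.
if "model_content_size_in_bytes" not in model_meta:
    model_meta["model_content_size_in_bytes"] = os.stat(model_path).st_size
if "model_content_hash_value" not in model_meta:
    model_meta["model_content_hash_value"] = _generate_model_content_hash_value(
        model_path
    )
```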
diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
index 3fbb334d..db5d5a22 100644
--- a/opensearch_py_ml/ml_models/sentencetransformermodel.py
+++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -21,6 +21,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import requests
 import torch
 import yaml
 from accelerate import Accelerator, notebook_launcher
@@ -32,6 +33,12 @@
 from transformers import TrainingArguments, get_linear_schedule_with_warmup
 from transformers.convert_graph_to_onnx import convert
 
+from opensearch_py_ml.ml_commons.ml_common_utils import (
+    _generate_model_content_hash_value,
+)
+
+LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE"
+
 
 class SentenceTransformerModel:
     """
@@ -48,7 +55,7 @@ def __init__(
         overwrite: bool = False,
     ) -> None:
         """
-        Description: Initiate a sentence transformer model class object. The model id will be used to download
+        Initiate a sentence transformer model class object. The model id will be used to download
         pretrained model from the hugging-face and served as the default name for model files, and the
         folder_path will be the default location to store files generated in the following functions
 
@@ -86,6 +93,8 @@ def __init__(
         )
 
         self.model_id = model_id
+        self.torch_script_zip_file_path = None
+        self.onnx_zip_file_path = None
 
     def train(
         self,
@@ -372,7 +381,6 @@ def train_model(
         percentile: float = 95,
     ):
         """
-        Description:
         Takes in training data and a sentence transformer url to train a custom semantic search model
 
         :param train_examples:
@@ -633,16 +641,32 @@ def train_model(
             print("Model saved to path: " + self.folder_path + "\n")
         return traced_cpu
 
+    def _add_apache_license_to_model_zip_file(self, model_zip_file_path: str):
+        """
+        Add an Apache-2.0 license file to the model zip file at model_zip_file_path
+
+        :param model_zip_file_path:
+            Path to the model zip file
+        :type model_zip_file_path: string
+        :return: no return value expected
+        :rtype: None
+        """
+        r = requests.get(LICENSE_URL)
+        assert r.status_code == 200, "Failed to add license file to the model zip file"
+
+        with ZipFile(str(model_zip_file_path), "a") as zipObj:
+            zipObj.writestr("LICENSE", r.content)
+
     def zip_model(
         self,
         model_path: str = None,
         model_name: str = None,
         zip_file_name: str = None,
+        add_apache_license: bool = False,
         verbose: bool = False,
     ) -> None:
         """
-        Description:
-        zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster
+        Zip the model file and its tokenizer.json file to prepare to upload to the OpenSearch cluster
 
         :param model_path:
             Optional, path to find the model file, if None, default as concatenate model_id and
@@ -654,6 +678,9 @@ def zip_model(
         :param zip_file_name: str =None
             Optional, file name for zip file. if None, default as concatenate model_id and '.zip'
         :type zip_file_name: string
+        :param add_apache_license:
+            Optional, whether to add an Apache-2.0 license file to the model zip file
+        :type add_apache_license: bool
         :param verbose:
             optional, use to print more logs. Default as false
         :type verbose: bool
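For reference, a self-contained sketch of what `_add_apache_license_to_model_zip_file` does, using `raise_for_status()` in place of the assert; the zip file name is hypothetical:

```python
from zipfile import ZipFile

import requests

LICENSE_URL = "https://github.com/opensearch-project/opensearch-py-ml/raw/main/LICENSE"


def add_apache_license(zip_path: str) -> None:
    # Fetch the Apache-2.0 license text from the repository...
    response = requests.get(LICENSE_URL)
    response.raise_for_status()
    # ...and append it to the existing archive as a top-level LICENSE entry.
    with ZipFile(zip_path, "a") as zip_obj:
        zip_obj.writestr("LICENSE", response.content)


add_apache_license("msmarco-distilbert-base-tas-b.zip")  # hypothetical zip file
```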
@@ -672,8 +699,9 @@
             print("model path is: ", model_path)
 
         if zip_file_name is None:
-            zip_file_name = str(model_name + ".zip")
+            zip_file_name = str(self.model_id.split("/")[-1] + ".zip")
 
+        zip_file_path = os.path.join(self.folder_path, zip_file_name)
         zip_file_name_without_extension = zip_file_name.split(".")[0]
 
         if verbose:
@@ -682,8 +710,6 @@
             tokenizer_json_path = os.path.join(self.folder_path, "tokenizer.json")
             print("tokenizer_json_path: ", tokenizer_json_path)
 
-        zip_file_path = os.path.join(self.folder_path, zip_file_name)
-
         if not os.path.exists(tokenizer_json_path):
             raise Exception(
                 "Cannot find tokenizer.json file, please check at "
@@ -695,12 +721,15 @@
             )
 
         # Create a ZipFile Object
-        with ZipFile(zip_file_path, "w") as zipObj:
-            zipObj.write(model_path, zip_file_name_without_extension + "/" + model_name)
+        with ZipFile(str(zip_file_path), "w") as zipObj:
+            zipObj.write(model_path, arcname=str(model_name))
             zipObj.write(
                 tokenizer_json_path,
-                zip_file_name_without_extension + "/" + "tokenizer.json",
+                arcname="tokenizer.json",
             )
+        if add_apache_license:
+            self._add_apache_license_to_model_zip_file(zip_file_path)
+
         print("zip file is saved to " + zip_file_path + "\n")
 
     def _fill_null_truncation_field(
@@ -709,7 +738,6 @@
         self,
         max_length: int,
     ) -> None:
         """
-        Description:
         Fill truncation field in tokenizer.json when it is null
 
         :param save_json_folder_path:
@@ -742,9 +770,10 @@ def save_as_pt(
         save_json_folder_path: str = None,
         model_output_path: str = None,
         zip_file_name: str = None,
+        add_apache_license: bool = False,
     ) -> str:
         """
-        download sentence transformer model directly from huggingface, convert model to torch script format,
+        Download sentence transformer model directly from huggingface, convert model to torch script format,
         zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster
 
         :param sentences:
@@ -770,6 +799,9 @@ def save_as_pt(
             Optional, file name for zip file. e.g, "sample_model.zip". If None, default takes the
             model_id and add the extension with ".zip"
         :type zip_file_name: string
+        :param add_apache_license:
+            Optional, whether to add an Apache-2.0 license file to the model zip file
+        :type add_apache_license: bool
         :return: model zip file path. The file path where the zip file is being saved
         :rtype: string
         """
@@ -826,7 +858,7 @@ def save_as_pt(
         torch.jit.save(compiled_model, model_path)
         print("model file is saved to ", model_path)
 
-        # zip model file along with tokenizer.json as output
+        # zip model file along with tokenizer.json (and license file) as output
         with ZipFile(str(zip_file_path), "w") as zipObj:
             zipObj.write(
                 model_path,
@@ -836,6 +868,10 @@ def save_as_pt(
                 os.path.join(save_json_folder_path, "tokenizer.json"),
                 arcname="tokenizer.json",
             )
+        if add_apache_license:
+            self._add_apache_license_to_model_zip_file(zip_file_path)
+
+        self.torch_script_zip_file_path = zip_file_path
         print("zip file is saved to ", zip_file_path, "\n")
 
         return zip_file_path
@@ -846,9 +882,10 @@ def save_as_onnx(
         save_json_folder_path: str = None,
         model_output_path: str = None,
         zip_file_name: str = None,
+        add_apache_license: bool = False,
     ) -> str:
         """
-        download sentence transformer model directly from huggingface, convert model to onnx format,
+        Download sentence transformer model directly from huggingface, convert model to onnx format,
         zip the model file and its tokenizer.json file to prepare to upload to the Open Search cluster
 
         :param model_id:
@@ -871,6 +908,9 @@ def save_as_onnx(
             Optional, file name for zip file. e.g, "sample_model.zip". If None, default takes the
             model_id and add the extension with ".zip"
         :type zip_file_name: string
+        :param add_apache_license:
+            Optional, whether to add an Apache-2.0 license file to the model zip file
+        :type add_apache_license: bool
         :return: model zip file path. The file path where the zip file is being saved
         :rtype: string
         """
@@ -916,7 +956,7 @@ def save_as_onnx(
 
         print("model file is saved to ", model_path)
 
-        # zip model file along with tokenizer.json as output
+        # zip model file along with tokenizer.json (and license file) as output
         with ZipFile(str(zip_file_path), "w") as zipObj:
             zipObj.write(
                 model_path,
@@ -926,6 +966,10 @@ def save_as_onnx(
                 os.path.join(save_json_folder_path, "tokenizer.json"),
                 arcname="tokenizer.json",
             )
+        if add_apache_license:
+            self._add_apache_license_to_model_zip_file(zip_file_path)
+
+        self.onnx_zip_file_path = zip_file_path
         print("zip file is saved to ", zip_file_path, "\n")
 
         return zip_file_path
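`save_as_pt` and `save_as_onnx` now record the produced zip path on the instance, which `make_model_config_json` (changed below) consumes. A hedged sketch of the intended call order, with a hypothetical working folder:

```python
from opensearch_py_ml.ml_models.sentencetransformermodel import (
    SentenceTransformerModel,
)

model = SentenceTransformerModel(
    model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
    folder_path="/tmp/models/tas-b",  # hypothetical working folder
    overwrite=True,
)

# Tracing stores the zip path on the instance (self.torch_script_zip_file_path)...
model.save_as_pt(
    model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
    sentences=["today is sunny"],
    add_apache_license=True,
)

# ...so the config generator can fill model_content_size_in_bytes and
# model_content_hash_value without an explicit model_zip_file_path.
config_path = model.make_model_config_json(model_format="TORCH_SCRIPT")
```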
@@ -937,7 +981,7 @@ def set_up_accelerate_config(
         self,
         compute_environment: str = None,
         num_machines: int = 1,
         verbose: bool = False,
     ) -> None:
         """
-        get default config setting based on the number of GPU on the machine
+        Get default config setting based on the number of GPU on the machine
         if users require other configs, users can run !acclerate config for more options
 
         :param compute_environment:
@@ -1099,6 +1143,7 @@ def make_model_config_json(
         model_name: str = None,
         version_number: str = 1,
         model_format: str = "TORCH_SCRIPT",
+        model_zip_file_path: str = None,
         embedding_dimension: int = None,
         pooling_mode: str = None,
         normalize_result: bool = None,
@@ -1108,15 +1153,21 @@ def make_model_config_json(
         verbose: bool = False,
     ) -> str:
         """
-        parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file. If all required
-        fields are given by users, use the given parameters and will skip reading the config.json
+        Parse from config.json file of pre-trained hugging-face model to generate a ml-commons_model_config.json file.
+        If all required fields are given by users, use the given parameters and will skip reading the config.json
+
         :param model_name:
             Optional, The name of the model. If None, default is model id, for example,
             'sentence-transformers/msmarco-distilbert-base-tas-b'
         :type model_name: string
         :param model_format:
-            Optional, The format of the model. Default is "TORCH_SCRIPT".
+            Optional, the format of the model. Default is "TORCH_SCRIPT".
         :type model_format: string
+        :param model_zip_file_path:
+            Optional, path to the model zip file. Default is the zip file path used in save_as_pt or save_as_onnx
+            depending on model_format. This zip file is used to compute model_content_size_in_bytes and
+            model_content_hash_value.
+        :type model_zip_file_path: string
         :param version_number:
             Optional, The version number of the model. Default is 1
         :type version_number: string
@@ -1239,16 +1290,34 @@ def make_model_config_json(
             },
         }
 
+        if model_zip_file_path is None:
+            model_zip_file_path = (
+                self.torch_script_zip_file_path
+                if model_format == "TORCH_SCRIPT"
+                else self.onnx_zip_file_path
+            )
+        if model_zip_file_path is None:
+            print(
+                "The model configuration JSON file currently lacks the 'model_content_size_in_bytes' and 'model_content_hash_value' fields. You can include these fields by specifying the 'model_zip_file_path' parameter. Failure to do so may result in the model registration process encountering issues."
+            )
+        else:
+            model_config_content["model_content_size_in_bytes"] = os.stat(
+                model_zip_file_path
+            ).st_size
+            model_config_content[
+                "model_content_hash_value"
+            ] = _generate_model_content_hash_value(model_zip_file_path)
+
         if verbose:
             print("generating ml-commons_model_config.json file...\n")
-            print(model_config_content)
+            print(json.dumps(model_config_content, indent=4))
 
         model_config_file_path = os.path.join(
             folder_path, "ml-commons_model_config.json"
         )
         os.makedirs(os.path.dirname(model_config_file_path), exist_ok=True)
         with open(model_config_file_path, "w") as file:
-            json.dump(model_config_content, file)
+            json.dump(model_config_content, file, indent=4)
         print(
             "ml-commons_model_config.json file is saved at : ", model_config_file_path
         )
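When tracing and config generation happen in separate sessions, the instance attribute is gone and the zip path must be passed explicitly, as the warning above describes. A sketch with hypothetical paths:

```python
from opensearch_py_ml.ml_models.sentencetransformermodel import (
    SentenceTransformerModel,
)

model = SentenceTransformerModel(
    model_id="sentence-transformers/msmarco-distilbert-base-tas-b",
    folder_path="/tmp/models/tas-b",  # hypothetical working folder
)

# Without a prior save_as_pt/save_as_onnx in this session, omitting
# model_zip_file_path would leave the size/hash fields out and print a warning.
config_path = model.make_model_config_json(
    model_format="TORCH_SCRIPT",
    model_zip_file_path="/tmp/models/tas-b/msmarco-distilbert-base-tas-b.zip",
)
```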
diff --git a/tests/ml_commons/test_model_uploader.py b/tests/ml_commons/test_model_uploader.py
index 78be2209..31e35932 100644
--- a/tests/ml_commons/test_model_uploader.py
+++ b/tests/ml_commons/test_model_uploader.py
@@ -8,6 +8,9 @@
 import pytest
 from opensearchpy.client import OpenSearch
 
+from opensearch_py_ml.ml_commons.ml_common_utils import (
+    _generate_model_content_hash_value,
+)
 from opensearch_py_ml.ml_commons.model_uploader import ModelUploader
 from tests import FLIGHTS_SMALL_FILE_NAME
 
@@ -88,8 +91,8 @@ def test_check_mandatory_field():
         model_uploader._check_mandatory_field(model_meta)
 
 
-def test_generate_hash():
+def test_generate_model_content_hash_value():
     assert (
         "18521f420cf85149025b75df329689c416be0ce3fc78b2afdfdf177654b77b34"
-        == model_uploader._generate_hash(FLIGHTS_SMALL_FILE_NAME)
+        == _generate_model_content_hash_value(FLIGHTS_SMALL_FILE_NAME)
     )
diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py
index 9b0fe30a..c9c9046b 100644
--- a/tests/ml_models/test_sentencetransformermodel_pytest.py
+++ b/tests/ml_models/test_sentencetransformermodel_pytest.py
@@ -8,6 +8,7 @@
 import json
 import os
 import shutil
+from zipfile import ZipFile
 
 import pytest
 
@@ -44,6 +45,69 @@ def clean_test_folder(TEST_FOLDER):
         shutil.rmtree(TEST_FOLDER)
 
 
+def compare_model_config(
+    model_config_path,
+    model_id,
+    model_format,
+    expected_model_description=None,
+    expected_model_config_data=None,
+):
+    try:
+        with open(model_config_path) as json_file:
+            model_config_data = json.load(json_file)
+    except Exception as exec:
+        assert (
+            False
+        ), f"Creating model config file for tracing in {model_format} raised an exception {exec}"
+
+    assert (
+        "name" in model_config_data and model_config_data["name"] == model_id
+    ), f"Missing or Wrong model name in {model_format} model config file"
+
+    assert (
+        "model_format" in model_config_data
+        and model_config_data["model_format"] == model_format
+    ), f"Missing or Wrong model_format in {model_format} model config file"
+
+    if expected_model_description is not None:
+        assert (
+            "description" in model_config_data
+            and model_config_data["description"] == expected_model_description
+        ), f"Missing or Wrong model description in {model_format} model config file"
+
+    if expected_model_config_data is not None:
+        assert (
+            "model_config" in model_config_data
+        ), f"Missing 'model_config' in {model_format} model config file"
+
+        for k, v in expected_model_config_data.items():
+            assert (
+                k in model_config_data["model_config"]
+                and model_config_data["model_config"][k] == v
+            ) or (
+                k not in model_config_data["model_config"]
+                and k == "normalize_result"
+                and not v
+            )
+
+    assert (
+        "model_content_size_in_bytes" in model_config_data
+    ), f"Missing 'model_content_size_in_bytes' in {model_format} model config file"
+
+    assert (
+        "model_content_hash_value" in model_config_data
+    ), f"Missing 'model_content_hash_value' in {model_format} model config file"
+
+
+def compare_model_zip_file(zip_file_path, expected_filenames, model_format):
+    with ZipFile(zip_file_path, "r") as f:
+        filenames = set(f.namelist())
+        assert (
+            filenames == expected_filenames
+        ), f"The content in the {model_format} model zip file does not match the expected content: {filenames} != {expected_filenames}"
+
+
 clean_test_folder(TEST_FOLDER)
 test_model = SentenceTransformerModel(folder_path=TEST_FOLDER)
 
@@ -173,6 +237,8 @@ def test_save_as_onnx():
 
 def test_make_model_config_json_for_torch_script():
     model_id = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+    model_format = "TORCH_SCRIPT"
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources."
     expected_model_config_data = {
         "embedding_dimension": 384,
         "pooling_mode": "MEAN",
@@ -189,46 +255,22 @@ def test_make_model_config_json_for_torch_script():
     model_config_path_torch = test_model5.make_model_config_json(
         model_format="TORCH_SCRIPT", verbose=True
     )
-    try:
-        with open(model_config_path_torch) as json_file:
-            model_config_data_torch = json.load(json_file)
-    except Exception as exec:
-        assert (
-            False
-        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
-
-    assert (
-        "name" in model_config_data_torch
-        and model_config_data_torch["name"] == model_id
-    ), "Missing or Wrong model name in torch script model config file"
-    assert (
-        "model_format" in model_config_data_torch
-        and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
-    ), "Missing or Wrong model_format in torch script model config file"
-    assert (
-        "description" in model_config_data_torch
-        and model_config_data_torch["description"]
-        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources."
-    ), "Missing or Wrong model description in onnx model config file'"
-    assert (
-        "model_config" in model_config_data_torch
-    ), "Missing 'model_config' in torch script model config file"
-    for k, v in expected_model_config_data.items():
-        assert (
-            k in model_config_data_torch["model_config"]
-            and model_config_data_torch["model_config"][k] == v
-        ) or (
-            k not in model_config_data_torch["model_config"]
-            and k == "normalize_result"
-            and not v
-        )
+    compare_model_config(
+        model_config_path_torch,
+        model_id,
+        model_format,
+        expected_model_description=expected_model_description,
+        expected_model_config_data=expected_model_config_data,
+    )
 
     clean_test_folder(TEST_FOLDER)
 
 
 def test_make_model_config_json_for_onnx():
     model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    model_format = "ONNX"
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
     expected_model_config_data = {
         "embedding_dimension": 384,
         "pooling_mode": "MEAN",
@@ -243,45 +285,21 @@ def test_make_model_config_json_for_onnx():
     test_model6.save_as_onnx(model_id=model_id)
 
     model_config_path_onnx = test_model6.make_model_config_json(model_format="ONNX")
-    try:
-        with open(model_config_path_onnx) as json_file:
-            model_config_data_onnx = json.load(json_file)
-    except Exception as exec:
-        assert (
-            False
-        ), f"Creating model config file for tracing in onnx raised an exception {exec}"
-
-    assert (
-        "name" in model_config_data_onnx and model_config_data_onnx["name"] == model_id
-    ), "Missing or Wrong model name in onnx model config file"
-    assert (
-        "model_format" in model_config_data_onnx
-        and model_config_data_onnx["model_format"] == "ONNX"
-    ), "Missing or Wrong model_format in onnx model config file"
-    assert (
-        "description" in model_config_data_onnx
-        and model_config_data_onnx["description"]
-        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search."
-    ), "Missing or Wrong model description in onnx model config file"
-    assert (
-        "model_config" in model_config_data_onnx
-    ), "Missing 'model_config' in onnx model config file"
-    for k, v in expected_model_config_data.items():
-        assert (
-            k in model_config_data_onnx["model_config"]
-            and model_config_data_onnx["model_config"][k] == v
-        ) or (
-            k not in model_config_data_onnx["model_config"]
-            and k == "normalize_result"
-            and not v
-        )
+    compare_model_config(
+        model_config_path_onnx,
+        model_id,
+        model_format,
+        expected_model_description=expected_model_description,
+        expected_model_config_data=expected_model_config_data,
+    )
 
     clean_test_folder(TEST_FOLDER)
 
 
 def test_overwrite_fields_in_model_config():
     model_id = "sentence-transformers/all-distilroberta-v1"
+    model_format = "TORCH_SCRIPT"
     expected_model_config_data = {
         "embedding_dimension": 768,
         "pooling_mode": "MEAN",
@@ -305,35 +323,13 @@ def test_overwrite_fields_in_model_config():
         model_format="TORCH_SCRIPT"
     )
 
-    try:
-        with open(model_config_path_torch) as json_file:
-            model_config_data_torch = json.load(json_file)
-    except Exception as exec:
-        assert (
-            False
-        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
-
-    assert (
-        "name" in model_config_data_torch
-        and model_config_data_torch["name"] == model_id
-    ), "Missing or Wrong model name in torch script model config file"
-    assert (
-        "model_format" in model_config_data_torch
-        and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
-    ), "Missing or Wrong model_format in onnx model config file"
-    assert (
-        "model_config" in model_config_data_torch
-    ), "Missing 'model_config' in torch script model config file"
-
-    for k, v in expected_model_config_data.items():
-        assert (
-            k in model_config_data_torch["model_config"]
-            and model_config_data_torch["model_config"][k] == v
-        ) or (
-            k not in model_config_data_torch["model_config"]
-            and k == "normalize_result"
-            and not v
-        )
+    compare_model_config(
+        model_config_path_torch,
+        model_id,
+        model_format,
+        expected_model_description=None,
+        expected_model_config_data=expected_model_config_data,
+    )
 
     clean_test_folder(TEST_FOLDER)
     test_model8 = SentenceTransformerModel(
@@ -349,41 +345,22 @@ def test_overwrite_fields_in_model_config():
         normalize_result=overwritten_model_config_data["normalize_result"],
     )
 
-    try:
-        with open(model_config_path_torch) as json_file:
-            model_config_data_torch = json.load(json_file)
-    except Exception as exec:
-        assert (
-            False
-        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
-
-    assert (
-        "name" in model_config_data_torch
-        and model_config_data_torch["name"] == model_id
-    ), "Missing or Wrong model name in torch script model config file"
-    assert (
-        "model_format" in model_config_data_torch
-        and model_config_data_torch["model_format"] == "TORCH_SCRIPT"
-    ), "Missing or Wrong model_format in torch script model config file"
-    assert (
-        "model_config" in model_config_data_torch
-    ), "Missing 'model_config' in torch script model config file"
-
-    for k, v in overwritten_model_config_data.items():
-        assert (
-            k in model_config_data_torch["model_config"]
-            and model_config_data_torch["model_config"][k] == v
-        ) or (
-            k not in model_config_data_torch["model_config"]
-            and k == "normalize_result"
-            and not v
-        )
+    compare_model_config(
+        model_config_path_torch,
+        model_id,
+        model_format,
+        expected_model_description=None,
+        expected_model_config_data=overwritten_model_config_data,
+    )
 
     clean_test_folder(TEST_FOLDER)
 
 
 def test_missing_readme_md_file():
     model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+    model_format = "TORCH_SCRIPT"
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space."
+
     clean_test_folder(TEST_FOLDER)
     test_model9 = SentenceTransformerModel(
         folder_path=TEST_FOLDER,
@@ -397,7 +374,7 @@ def test_missing_readme_md_file():
     )
     os.remove(temp_path)
     model_config_path_torch = test_model9.make_model_config_json(
-        model_format="TORCH_SCRIPT"
+        model_format=model_format
    )
     try:
         with open(model_config_path_torch) as json_file:
@@ -405,12 +382,11 @@ def test_missing_readme_md_file():
     except Exception as exec:
         assert (
             False
-        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+        ), f"Creating model config file for tracing in {model_format} raised an exception {exec}"
 
     assert (
         "description" in model_config_data_torch
-        and model_config_data_torch["description"]
-        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space."
+        and model_config_data_torch["description"] == expected_model_description
     ), "Should use default model description when README.md file is missing"
 
     clean_test_folder(TEST_FOLDER)
@@ -418,6 +394,9 @@ def test_missing_readme_md_file():
 
 def test_missing_expected_description_in_readme_file():
     model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
+    model_format = "TORCH_SCRIPT"
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space."
+
     clean_test_folder(TEST_FOLDER)
     test_model10 = SentenceTransformerModel(
         folder_path=TEST_FOLDER,
@@ -432,7 +411,7 @@ def test_missing_expected_description_in_readme_file():
     with open(temp_path, "w") as f:
         f.write("No model description here")
     model_config_path_torch = test_model10.make_model_config_json(
-        model_format="TORCH_SCRIPT"
+        model_format=model_format
     )
     try:
         with open(model_config_path_torch) as json_file:
@@ -440,12 +419,11 @@ def test_missing_expected_description_in_readme_file():
     except Exception as exec:
         assert (
             False
-        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+        ), f"Creating model config file for tracing in {model_format} raised an exception {exec}"
 
     assert (
         "description" in model_config_data_torch
-        and model_config_data_torch["description"]
-        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space."
+        and model_config_data_torch["description"] == expected_model_description
     ), "Should use default model description when description is missing from README.md"
 
     clean_test_folder(TEST_FOLDER)
@@ -453,6 +431,9 @@ def test_missing_expected_description_in_readme_file():
 
 def test_overwrite_description():
     model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+    model_format = "TORCH_SCRIPT"
+    expected_model_description = "Expected Description"
+
     clean_test_folder(TEST_FOLDER)
     test_model11 = SentenceTransformerModel(
         folder_path=TEST_FOLDER,
@@ -461,7 +442,7 @@ def test_overwrite_description():
 
     test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"])
     model_config_path_torch = test_model11.make_model_config_json(
-        model_format="TORCH_SCRIPT", description="Expected Description"
+        model_format=model_format, description=expected_model_description
     )
     try:
         with open(model_config_path_torch) as json_file:
@@ -469,11 +450,11 @@ def test_overwrite_description():
     except Exception as exec:
         assert (
             False
-        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+        ), f"Creating model config file for tracing in {model_format} raised an exception {exec}"
 
     assert (
         "description" in model_config_data_torch
-        and model_config_data_torch["description"] == "Expected Description"
+        and model_config_data_torch["description"] == expected_model_description
     ), "Cannot overwrite description in model config file"
 
     clean_test_folder(TEST_FOLDER)
@@ -481,6 +462,9 @@ def test_overwrite_description():
 
 def test_long_description():
     model_id = "sentence-transformers/gtr-t5-base"
+    model_format = "TORCH_SCRIPT"
+    expected_model_description = "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of sematic search."
+
     clean_test_folder(TEST_FOLDER)
     test_model12 = SentenceTransformerModel(
         folder_path=TEST_FOLDER,
@@ -489,7 +473,7 @@ def test_long_description():
 
     test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"])
     model_config_path_torch = test_model12.make_model_config_json(
-        model_format="TORCH_SCRIPT"
+        model_format=model_format
     )
     try:
         with open(model_config_path_torch) as json_file:
@@ -497,13 +481,12 @@ def test_long_description():
     except Exception as exec:
         assert (
             False
-        ), f"Creating model config file for tracing in torch_script raised an exception {exec}"
+        ), f"Creating model config file for tracing in {model_format} raised an exception {exec}"
 
     assert (
         "description" in model_config_data_torch
-        and model_config_data_torch["description"]
-        == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. The model was specifically trained for the task of sematic search."
-    ), "Missing or Wrong model description in torch_script model config file"
+        and model_config_data_torch["description"] == expected_model_description
+    ), "Missing or Wrong model description in model config file when the description is longer than usual."
 
     clean_test_folder(TEST_FOLDER)
@@ -598,5 +581,82 @@ def test_undefined_model_max_length_in_tokenizer_for_onnx():
     clean_test_folder(TEST_FOLDER)
 
 
+def test_save_as_pt_with_license():
+    model_id = "sentence-transformers/all-MiniLM-L6-v2"
+    model_format = "TORCH_SCRIPT"
+    torch_script_zip_file_path = os.path.join(TEST_FOLDER, "all-MiniLM-L6-v2.zip")
+    torch_script_expected_filenames = {
+        "all-MiniLM-L6-v2.pt",
+        "tokenizer.json",
+        "LICENSE",
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model15 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model15.save_as_pt(
+        model_id=model_id,
+        sentences=["today is sunny"],
+        add_apache_license=True,
+    )
+
+    compare_model_zip_file(
+        torch_script_zip_file_path, torch_script_expected_filenames, model_format
+    )
+
+    clean_test_folder(TEST_FOLDER)
+
+
+def test_save_as_onnx_with_license():
+    model_id = "sentence-transformers/all-distilroberta-v1"
+    model_format = "ONNX"
+    onnx_zip_file_path = os.path.join(TEST_FOLDER, "all-distilroberta-v1.zip")
+    onnx_expected_filenames = {"all-distilroberta-v1.onnx", "tokenizer.json", "LICENSE"}
+
+    clean_test_folder(TEST_FOLDER)
+    test_model16 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model16.save_as_onnx(model_id=model_id, add_apache_license=True)
+
+    compare_model_zip_file(onnx_zip_file_path, onnx_expected_filenames, model_format)
+
+    clean_test_folder(TEST_FOLDER)
+
+
+def test_zip_model_with_license():
+    model_id = "sentence-transformers/msmarco-distilbert-base-tas-b"
+    model_format = "TORCH_SCRIPT"
+    zip_file_path = os.path.join(TEST_FOLDER, "msmarco-distilbert-base-tas-b.zip")
+    expected_filenames_wo_license = {
+        "msmarco-distilbert-base-tas-b.pt",
+        "tokenizer.json",
+    }
+    expected_filenames_with_license = {
+        "msmarco-distilbert-base-tas-b.pt",
+        "tokenizer.json",
+        "LICENSE",
+    }
+
+    clean_test_folder(TEST_FOLDER)
+    test_model17 = SentenceTransformerModel(
+        folder_path=TEST_FOLDER,
+        model_id=model_id,
+    )
+
+    test_model17.save_as_pt(model_id=model_id, sentences=["today is sunny"])
+    compare_model_zip_file(zip_file_path, expected_filenames_wo_license, model_format)
+
+    test_model17.zip_model(add_apache_license=True)
+    compare_model_zip_file(zip_file_path, expected_filenames_with_license, model_format)
+
+    clean_test_folder(TEST_FOLDER)
+
+
 clean_test_folder(TEST_FOLDER)
 clean_test_folder(TESTDATA_UNZIP_FOLDER)
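The license tests above assert exact zip membership via `compare_model_zip_file`. A stripped-down, self-contained sketch of that check with hypothetical file names:

```python
from zipfile import ZipFile


def zip_contains_exactly(zip_path: str, expected: set) -> bool:
    # Compare the archive's full member list against the expected set.
    with ZipFile(zip_path, "r") as f:
        return set(f.namelist()) == expected


# e.g. after save_as_pt(..., add_apache_license=True):
# zip_contains_exactly("model.zip", {"model.pt", "tokenizer.json", "LICENSE"})
```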
diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py
index 3ffd3e56..3794087a 100644
--- a/utils/model_uploader/model_autotracing.py
+++ b/utils/model_uploader/model_autotracing.py
@@ -16,7 +16,6 @@
 import sys
 import warnings
 from typing import List, Optional, Tuple
-from zipfile import ZipFile
 
 import numpy as np
 from mdutils.fileutils import MarkDownFile
@@ -30,7 +29,6 @@
 ROOT_DIR = os.path.join(THIS_DIR, "../..")
 sys.path.append(ROOT_DIR)
 
-LICENSE_PATH = "LICENSE"
 from opensearch_py_ml.ml_commons import MLCommonClient
 from opensearch_py_ml.ml_models.sentencetransformermodel import SentenceTransformerModel
 from tests import OPENSEARCH_TEST_CLIENT
@@ -49,7 +47,8 @@
 DESCRIPTION_VAR_FILE = "description.txt"
 TEST_SENTENCES = [
     "First test sentence",
-    "This is a very long sentence used for testing model embedding outputs.",
+    "This is another sentence used for testing model embedding outputs.",
+    "OpenSearch is a scalable, flexible, and extensible open-source software suite for search, analytics, and observability applications licensed under Apache 2.0. Powered by Apache Lucene and driven by the OpenSearch Project community, OpenSearch offers a vendor-agnostic toolset you can use to build secure, high-performance, cost-efficient applications. Use OpenSearch as an end-to-end solution or connect it with your preferred open-source tools or partner projects.",
 ]
 RTOL_TEST = 1e-03
 ATOL_TEST = 1e-05
@@ -77,10 +76,10 @@ def verify_license_in_md_file() -> bool:
         return False
     metadata_info = readme_data[start + 3 : end]
     if "apache-2.0" in metadata_info.lower():
-        print("\nFound apache-2.0 license at " + TEMP_MODEL_PATH + "README.md")
+        print("\nFound apache-2.0 license at " + TEMP_MODEL_PATH + "/README.md")
         return True
     else:
-        print("\nDid not find apache-2.0 license at " + TEMP_MODEL_PATH + "README.md")
+        print("\nDid not find apache-2.0 license at " + TEMP_MODEL_PATH + "/README.md")
         return False
 
@@ -134,16 +133,21 @@ def trace_sentence_transformer_model(
     try:
         if model_format == TORCH_SCRIPT_FORMAT:
             model_path = pre_trained_model.save_as_pt(
-                model_id=model_id, sentences=TEST_SENTENCES
+                model_id=model_id,
+                sentences=TEST_SENTENCES,
+                add_apache_license=True,
             )
         else:
-            model_path = pre_trained_model.save_as_onnx(model_id=model_id)
+            model_path = pre_trained_model.save_as_onnx(
+                model_id=model_id, add_apache_license=True
+            )
     except Exception as e:
         assert False, f"Raised Exception during saving model as {model_format}: {e}"
 
     # 3.) Create a model config json file
+    model_config_path = None
     try:
-        pre_trained_model.make_model_config_json(
+        model_config_path = pre_trained_model.make_model_config_json(
             version_number=model_version,
             model_format=model_format,
             embedding_dimension=embedding_dimension,
@@ -155,9 +159,14 @@ def trace_sentence_transformer_model(
             False
         ), f"Raised Exception during making model config file for {model_format} model: {e}"
 
-    # 4.) Return model_path & model_config_path for model registration
-    model_config_path = folder_path + MODEL_CONFIG_FILE_NAME
+    # 4.) Preview model config
+    print(f"\n+++++ {model_format} Model Config +++++\n")
+    with open(model_config_path, "r") as f:
+        model_config = json.load(f)
+        print(json.dumps(model_config, indent=4))
+    print("\n+++++++++++++++++++++++++++++++++++++++\n")
 
+    # 5.) Return model_path & model_config_path for model registration
     return model_path, model_config_path
 
@@ -215,7 +224,7 @@ def register_and_deploy_sentence_transformer_model(
     # 3.) Generate embeddings
     try:
         embedding_output = ml_client.generate_embedding(model_id, TEST_SENTENCES)
-        assert len(embedding_output.get("inference_results")) == 2
+        assert len(embedding_output.get("inference_results")) == len(TEST_SENTENCES)
         embedding_data = [
             embedding_output["inference_results"][i]["output"][0]["data"]
             for i in range(len(TEST_SENTENCES))
@@ -322,8 +331,6 @@ def prepare_files_for_uploading(
         f"{model_type}_{model_name}-{model_version}-{model_format}.zip"
     )
     dst_model_path = dst_model_dir + "/" + dst_model_filename
-    with ZipFile(src_model_path, "a") as zipObj:
-        zipObj.write(filename=LICENSE_PATH, arcname="LICENSE")
     shutil.copy(src_model_path, dst_model_path)
     print(f"\nCopied {src_model_path} to {dst_model_path}")
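Downstream of these changes, the autotracing script checks that the deployed model reproduces the locally computed embeddings within the RTOL_TEST/ATOL_TEST tolerances defined above. A hedged, self-contained sketch of that comparison:

```python
import numpy as np

RTOL_TEST = 1e-03
ATOL_TEST = 1e-05


def embeddings_match(local, deployed) -> bool:
    """Return True when the deployed model reproduces the local embeddings."""
    try:
        np.testing.assert_allclose(deployed, local, rtol=RTOL_TEST, atol=ATOL_TEST)
        return True
    except AssertionError:
        return False


# Hypothetical one-sentence batch of 3-dim embeddings.
print(embeddings_match([[0.1, 0.2, 0.3]], [[0.1000002, 0.2, 0.3]]))  # True
```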