diff --git a/opensearch_py_ml/ml_commons/ml_common_utils.py b/opensearch_py_ml/ml_commons/ml_common_utils.py index b38e59b7..d8cf7b80 100644 --- a/opensearch_py_ml/ml_commons/ml_common_utils.py +++ b/opensearch_py_ml/ml_commons/ml_common_utils.py @@ -5,6 +5,8 @@ # Any modifications Copyright OpenSearch Contributors. See # GitHub history for details. +import hashlib + ML_BASE_URI = "/_plugins/_ml" MODEL_CHUNK_MAX_SIZE = 10_000_000 MODEL_MAX_SIZE = 4_000_000_000 @@ -22,3 +24,31 @@ FRAMEWORK_TYPE = "framework_type" MODEL_CONTENT_HASH_VALUE = "model_content_hash_value" MODEL_GROUP_ID = "model_group_id" + + +def _generate_model_content_hash_value(model_file_path: str) -> str: + """ + Generate sha256 hash value for the model zip file. + + Parameters + ---------- + :param model_file_path: file path of the model file + :type model_file_path: string + + + Returns + ------- + :return: sha256 hash + :rtype: string + + """ + + sha256 = hashlib.sha256() + with open(model_file_path, "rb") as file: + while True: + chunk = file.read(BUF_SIZE) + if not chunk: + break + sha256.update(chunk) + sha256_value = sha256.hexdigest() + return sha256_value diff --git a/opensearch_py_ml/ml_commons/model_uploader.py b/opensearch_py_ml/ml_commons/model_uploader.py index 4f02a0e1..eaa7af6a 100644 --- a/opensearch_py_ml/ml_commons/model_uploader.py +++ b/opensearch_py_ml/ml_commons/model_uploader.py @@ -5,7 +5,6 @@ # Any modifications Copyright OpenSearch Contributors. See # GitHub history for details. 
-import hashlib import json import os from math import ceil @@ -14,7 +13,6 @@ from opensearchpy import OpenSearch from opensearch_py_ml.ml_commons.ml_common_utils import ( - BUF_SIZE, EMBEDDING_DIMENSION, FRAMEWORK_TYPE, META_API_ENDPOINT, @@ -30,6 +28,7 @@ MODEL_TYPE, MODEL_VERSION_FIELD, TOTAL_CHUNKS_FIELD, + _generate_model_content_hash_value, ) @@ -86,7 +85,7 @@ def _register_model( total_num_chunks: int = ceil(model_content_size_in_bytes / MODEL_CHUNK_MAX_SIZE) # we are generating the sha1 hash for the model zip file - hash_val_model_file = self._generate_hash(model_path) + hash_val_model_file = _generate_model_content_hash_value(model_path) if isVerbose: print("Total number of chunks", total_num_chunks) @@ -189,30 +188,3 @@ def _check_mandatory_field(self, model_meta: dict) -> bool: return True else: raise ValueError("Model metadata can't be empty") - - def _generate_hash(self, model_file_path: str) -> str: - """ - Generate sha1 hash value for the model zip file. - - Parameters - ---------- - :param model_file_path: file path of the model file - :type model_file_path: string - - - Returns - ------- - :return: sha256 hash - :rtype: string - - """ - - sha256 = hashlib.sha256() - with open(model_file_path, "rb") as file: - while True: - chunk = file.read(BUF_SIZE) - if not chunk: - break - sha256.update(chunk) - sha256_value = sha256.hexdigest() - return sha256_value diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 3fbb334d..f20d3efc 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -32,6 +32,10 @@ from transformers import TrainingArguments, get_linear_schedule_with_warmup from transformers.convert_graph_to_onnx import convert +from opensearch_py_ml.ml_commons.ml_common_utils import ( + _generate_model_content_hash_value, +) + class SentenceTransformerModel: """ @@ -86,6 +90,8 @@ def __init__( ) 
self.model_id = model_id + self.torch_script_zip_file_path = None + self.onnx_zip_file_path = None def train( self, @@ -701,6 +707,7 @@ def zip_model( tokenizer_json_path, zip_file_name_without_extension + "/" + "tokenizer.json", ) + print("zip file is saved to " + zip_file_path + "\n") def _fill_null_truncation_field( @@ -836,6 +843,7 @@ def save_as_pt( os.path.join(save_json_folder_path, "tokenizer.json"), arcname="tokenizer.json", ) + self.torch_script_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") return zip_file_path @@ -926,6 +934,8 @@ def save_as_onnx( os.path.join(save_json_folder_path, "tokenizer.json"), arcname="tokenizer.json", ) + + self.onnx_zip_file_path = zip_file_path print("zip file is saved to ", zip_file_path, "\n") return zip_file_path @@ -1099,6 +1109,7 @@ def make_model_config_json( model_name: str = None, version_number: str = 1, model_format: str = "TORCH_SCRIPT", + model_zip_file_path: str = None, embedding_dimension: int = None, pooling_mode: str = None, normalize_result: bool = None, @@ -1239,6 +1250,24 @@ def make_model_config_json( }, } + if model_zip_file_path is None: + model_zip_file_path = ( + self.torch_script_zip_file_path + if model_format == "TORCH_SCRIPT" + else self.onnx_zip_file_path + ) + if model_zip_file_path is None: + print( + "Set model_zip_file_path parameter to add the field 'model_content_size_in_bytes' and 'model_content_hash_value' to model config json file." 
+ ) + else: + model_config_content["model_content_size_in_bytes"] = os.stat( + model_zip_file_path + ).st_size + model_config_content[ + "model_content_hash_value" + ] = _generate_model_content_hash_value(model_zip_file_path) + if verbose: print("generating ml-commons_model_config.json file...\n") print(model_config_content) diff --git a/tests/ml_commons/test_model_uploader.py b/tests/ml_commons/test_model_uploader.py index 78be2209..31e35932 100644 --- a/tests/ml_commons/test_model_uploader.py +++ b/tests/ml_commons/test_model_uploader.py @@ -8,6 +8,9 @@ import pytest from opensearchpy.client import OpenSearch +from opensearch_py_ml.ml_commons.ml_common_utils import ( + _generate_model_content_hash_value, +) from opensearch_py_ml.ml_commons.model_uploader import ModelUploader from tests import FLIGHTS_SMALL_FILE_NAME @@ -88,8 +91,8 @@ def test_check_mandatory_field(): model_uploader._check_mandatory_field(model_meta) -def test_generate_hash(): +def test_generate_model_content_hash_value(): assert ( "18521f420cf85149025b75df329689c416be0ce3fc78b2afdfdf177654b77b34" - == model_uploader._generate_hash(FLIGHTS_SMALL_FILE_NAME) + == _generate_model_content_hash_value(FLIGHTS_SMALL_FILE_NAME) )