Skip to content

Commit

Permalink
Preliminary work: Add generate_hash to utils
Browse files Browse the repository at this point in the history
Signed-off-by: thanawan-atc <[email protected]>
  • Loading branch information
thanawan-atc committed Sep 15, 2023
1 parent 5441861 commit 1839e2e
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 32 deletions.
30 changes: 30 additions & 0 deletions opensearch_py_ml/ml_commons/ml_common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

import hashlib

ML_BASE_URI = "/_plugins/_ml"
MODEL_CHUNK_MAX_SIZE = 10_000_000
MODEL_MAX_SIZE = 4_000_000_000
Expand All @@ -22,3 +24,31 @@
FRAMEWORK_TYPE = "framework_type"
MODEL_CONTENT_HASH_VALUE = "model_content_hash_value"
MODEL_GROUP_ID = "model_group_id"


def _generate_model_content_hash_value(model_file_path: str) -> str:
    """
    Generate sha256 hash value for the model zip file.

    This is a module-level helper, so it takes no ``self`` argument; callers
    pass only the path of the file to hash.

    Parameters
    ----------
    :param model_file_path: file path of the model file
    :type model_file_path: string

    Returns
    -------
    :return: sha256 hash of the file content, as a hexadecimal string
    :rtype: string
    """

    sha256 = hashlib.sha256()
    with open(model_file_path, "rb") as file:
        # Feed the file to the hasher BUF_SIZE bytes at a time so the whole
        # model archive never has to be held in memory at once.
        while True:
            chunk = file.read(BUF_SIZE)
            if not chunk:
                # An empty read means end of file.
                break
            sha256.update(chunk)
    return sha256.hexdigest()
32 changes: 2 additions & 30 deletions opensearch_py_ml/ml_commons/model_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

import hashlib
import json
import os
from math import ceil
Expand All @@ -14,7 +13,6 @@
from opensearchpy import OpenSearch

from opensearch_py_ml.ml_commons.ml_common_utils import (
BUF_SIZE,
EMBEDDING_DIMENSION,
FRAMEWORK_TYPE,
META_API_ENDPOINT,
Expand All @@ -30,6 +28,7 @@
MODEL_TYPE,
MODEL_VERSION_FIELD,
TOTAL_CHUNKS_FIELD,
_generate_model_content_hash_value,
)


Expand Down Expand Up @@ -86,7 +85,7 @@ def _register_model(
total_num_chunks: int = ceil(model_content_size_in_bytes / MODEL_CHUNK_MAX_SIZE)

# we are generating the sha256 hash for the model zip file
hash_val_model_file = self._generate_hash(model_path)
hash_val_model_file = _generate_model_content_hash_value(model_path)

if isVerbose:
print("Total number of chunks", total_num_chunks)
Expand Down Expand Up @@ -189,30 +188,3 @@ def _check_mandatory_field(self, model_meta: dict) -> bool:
return True
else:
raise ValueError("Model metadata can't be empty")

def _generate_hash(self, model_file_path: str) -> str:
    """
    Compute the sha256 digest of the model zip file.

    Parameters
    ----------
    :param model_file_path: file path of the model file
    :type model_file_path: string

    Returns
    -------
    :return: sha256 hash as a hexadecimal string
    :rtype: string
    """

    hasher = hashlib.sha256()
    with open(model_file_path, "rb") as model_file:
        # Pull BUF_SIZE-byte pieces until read() hands back an empty bytes
        # object, which marks the end of the file.
        for piece in iter(lambda: model_file.read(BUF_SIZE), b""):
            hasher.update(piece)
    return hasher.hexdigest()
29 changes: 29 additions & 0 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@
from transformers import TrainingArguments, get_linear_schedule_with_warmup
from transformers.convert_graph_to_onnx import convert

from opensearch_py_ml.ml_commons.ml_common_utils import (
_generate_model_content_hash_value,
)


class SentenceTransformerModel:
"""
Expand Down Expand Up @@ -86,6 +90,8 @@ def __init__(
)

self.model_id = model_id
self.torch_script_zip_file_path = None
self.onnx_zip_file_path = None

def train(
self,
Expand Down Expand Up @@ -701,6 +707,7 @@ def zip_model(
tokenizer_json_path,
zip_file_name_without_extension + "/" + "tokenizer.json",
)

print("zip file is saved to " + zip_file_path + "\n")

def _fill_null_truncation_field(
Expand Down Expand Up @@ -836,6 +843,7 @@ def save_as_pt(
os.path.join(save_json_folder_path, "tokenizer.json"),
arcname="tokenizer.json",
)
self.torch_script_zip_file_path = zip_file_path
print("zip file is saved to ", zip_file_path, "\n")
return zip_file_path

Expand Down Expand Up @@ -926,6 +934,8 @@ def save_as_onnx(
os.path.join(save_json_folder_path, "tokenizer.json"),
arcname="tokenizer.json",
)

self.onnx_zip_file_path = zip_file_path
print("zip file is saved to ", zip_file_path, "\n")
return zip_file_path

Expand Down Expand Up @@ -1099,6 +1109,7 @@ def make_model_config_json(
model_name: str = None,
version_number: str = 1,
model_format: str = "TORCH_SCRIPT",
model_zip_file_path: str = None,
embedding_dimension: int = None,
pooling_mode: str = None,
normalize_result: bool = None,
Expand Down Expand Up @@ -1239,6 +1250,24 @@ def make_model_config_json(
},
}

if model_zip_file_path is None:
model_zip_file_path = (
self.torch_script_zip_file_path
if model_format == "TORCH_SCRIPT"
else self.onnx_zip_file_path
)
if model_zip_file_path is None:
print(
"Set model_zip_file_path parameter to add the field 'model_content_size_in_bytes' and 'model_content_hash_value' to model config json file."
)
else:
model_config_content["model_content_size_in_bytes"] = os.stat(
model_zip_file_path
).st_size
model_config_content[
"model_content_hash_value"
] = _generate_model_content_hash_value(model_zip_file_path)

if verbose:
print("generating ml-commons_model_config.json file...\n")
print(model_config_content)
Expand Down
7 changes: 5 additions & 2 deletions tests/ml_commons/test_model_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
import pytest
from opensearchpy.client import OpenSearch

from opensearch_py_ml.ml_commons.ml_common_utils import (
_generate_model_content_hash_value,
)
from opensearch_py_ml.ml_commons.model_uploader import ModelUploader
from tests import FLIGHTS_SMALL_FILE_NAME

Expand Down Expand Up @@ -88,8 +91,8 @@ def test_check_mandatory_field():
model_uploader._check_mandatory_field(model_meta)


def test_generate_hash():
def test_generate_model_content_hash_value():
assert (
"18521f420cf85149025b75df329689c416be0ce3fc78b2afdfdf177654b77b34"
== model_uploader._generate_hash(FLIGHTS_SMALL_FILE_NAME)
== _generate_model_content_hash_value(FLIGHTS_SMALL_FILE_NAME)
)

0 comments on commit 1839e2e

Please sign in to comment.