Enable the model upload workflow to add model_content_size_in_bytes…

… & `model_content_hash_value` to model config automatically (#291) * Preliminary work: Add generate_hash to utils Signed-off-by: thanawan-atc <[email protected]> * More changes Signed-off-by: thanawan-atc <[email protected]> * Debug verbose Signed-off-by: thanawan-atc <[email protected]> * Linting Signed-off-by: thanawan-atc <[email protected]> * Move license step Signed-off-by: thanawan-atc <[email protected]> * Debug tests Signed-off-by: thanawan-atc <[email protected]> * Correct linting Signed-off-by: thanawan-atc <[email protected]> * Decompose functions Signed-off-by: thanawan-atc <[email protected]> * Bypassing Signed-off-by: Thanawan Atchariyachanvanit <[email protected]> * Bypassing Signed-off-by: Thanawan Atchariyachanvanit <[email protected]> * Add license to model uploader workflow Signed-off-by: thanawan-atc <[email protected]> * Fix LICENSE path bug Signed-off-by: thanawan-atc <[email protected]> * Get LICENSE with requests Signed-off-by: thanawan-atc <[email protected]> * Update CHANGELOG.md Signed-off-by: thanawan-atc <[email protected]> * Debug license adding Signed-off-by: thanawan-atc <[email protected]> * Debug Signed-off-by: thanawan-atc <[email protected]> * Fix license url Signed-off-by: thanawan-atc <[email protected]> * Update sentencetransformermodel.py Signed-off-by: Thanawan Atchariyachanvanit <[email protected]> * Update model_uploader.yml Signed-off-by: Thanawan Atchariyachanvanit <[email protected]> * Update model_uploader.yml Signed-off-by: Thanawan Atchariyachanvanit <[email protected]> * Update model_uploader.yml Signed-off-by: Thanawan Atchariyachanvanit <[email protected]> * Update model_uploader.yml Signed-off-by: Thanawan Atchariyachanvanit <[email protected]> * Update model_uploader.yml Signed-off-by: Thanawan Atchariyachanvanit <[email protected]> --------- Signed-off-by: thanawan-atc <[email protected]> Signed-off-by: Thanawan Atchariyachanvanit <[email protected]>
opensearch-project · Sep 19, 2023 · 0a7af98 · 0a7af98
1 parent 5441861
commit 0a7af98
Show file tree

Hide file tree

Showing 8 changed files with 369 additions and 210 deletions.
diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml
@@ -42,6 +42,14 @@ on:
         description: "(Optional) Description (Specify here if you want to overwrite the default model description)"
         required: false
         type: string
+      allow_overwrite:
+        description: "Allow the workflow to overwrite model in model hub"
+        required: true
+        type: choice
+        options:
+        - "NO"
+        - "YES"
+
 
 jobs:
   # Step 2: Initiate workflow variable
@@ -71,6 +79,7 @@ jobs:
         - Workflow Name: ${{ github.workflow }}
         - Workflow Run ID: ${{ github.run_id }}
         - Workflow Initiator: @${{ github.actor }}
+        - Allow Overwrite:  ${{ github.event.inputs.allow_overwrite }}
           
         ========= Workflow Input Information =========
         - Model ID: ${{ github.event.inputs.model_id }}
@@ -102,6 +111,7 @@ jobs:
   # Step 3: Check if the model already exists in the model hub
   checking-out-model-hub:
     needs: init-workflow-var
+    if: github.event.inputs.allow_overwrite == 'NO'
     runs-on: 'ubuntu-latest'
     permissions:
       id-token: write
@@ -148,6 +158,7 @@ jobs:
   # Step 4: Trace the model, Verify the embeddings & Upload the model files as artifacts
   model-auto-tracing:
     needs: [init-workflow-var, checking-out-model-hub]
+    if:  always() && needs.init-workflow-var.result == 'success' && (needs.checking-out-model-hub.result == 'success' || needs.checking-out-model-hub.result == 'skipped')
     name: model-auto-tracing
     runs-on: ubuntu-latest
     permissions:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 - Update pretrained_models_all_versions.json (2023-09-08 13:14:07) by @dhrubo-os ([#277](https://github.com/opensearch-project/opensearch-py-ml/pull/277))
 - Update model upload history -  sentence-transformers/distiluse-base-multilingual-cased-v1 (v.1.0.1)(TORCH_SCRIPT) by @dhrubo-os ([#281](https://github.com/opensearch-project/opensearch-py-ml/pull/281))
 - Update pretrained_models_all_versions.json (2023-09-14 10:28:41) by @dhrubo-os ([#282](https://github.com/opensearch-project/opensearch-py-ml/pull/282))
+- Enable the model upload workflow to add model_content_size_in_bytes & model_content_hash_value to model config automatically by @thanawan-atc ([#291](https://github.com/opensearch-project/opensearch-py-ml/pull/291))
 
 ### Fixed
 - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))

diff --git a/opensearch_py_ml/ml_commons/ml_common_utils.py b/opensearch_py_ml/ml_commons/ml_common_utils.py
@@ -5,6 +5,8 @@
 # Any modifications Copyright OpenSearch Contributors. See
 # GitHub history for details.
 
+import hashlib
+
 ML_BASE_URI = "/_plugins/_ml"
 MODEL_CHUNK_MAX_SIZE = 10_000_000
 MODEL_MAX_SIZE = 4_000_000_000
@@ -22,3 +24,31 @@
 FRAMEWORK_TYPE = "framework_type"
 MODEL_CONTENT_HASH_VALUE = "model_content_hash_value"
 MODEL_GROUP_ID = "model_group_id"
+
+
+def _generate_model_content_hash_value(model_file_path: str) -> str:
+    """
+    Generate sha256 hash value for the model zip file.
+
+    Parameters
+    ----------
+    :param model_file_path: file path of the model file
+    :type model_file_path: string
+
+
+    Returns
+    -------
+    :return: sha256 hash
+    :rtype: string
+
+    """
+
+    sha256 = hashlib.sha256()
+    with open(model_file_path, "rb") as file:
+        while True:
+            chunk = file.read(BUF_SIZE)
+            if not chunk:
+                break
+            sha256.update(chunk)
+    sha256_value = sha256.hexdigest()
+    return sha256_value
diff --git a/opensearch_py_ml/ml_commons/model_uploader.py b/opensearch_py_ml/ml_commons/model_uploader.py
@@ -5,7 +5,6 @@
 # Any modifications Copyright OpenSearch Contributors. See
 # GitHub history for details.
 
-import hashlib
 import json
 import os
 from math import ceil
@@ -14,7 +13,6 @@
 from opensearchpy import OpenSearch
 
 from opensearch_py_ml.ml_commons.ml_common_utils import (
-    BUF_SIZE,
     EMBEDDING_DIMENSION,
     FRAMEWORK_TYPE,
     META_API_ENDPOINT,
@@ -30,6 +28,7 @@
     MODEL_TYPE,
     MODEL_VERSION_FIELD,
     TOTAL_CHUNKS_FIELD,
+    _generate_model_content_hash_value,
 )
 
 
@@ -85,21 +84,27 @@ def _register_model(
         model_content_size_in_bytes = os.stat(model_path).st_size
         total_num_chunks: int = ceil(model_content_size_in_bytes / MODEL_CHUNK_MAX_SIZE)
 
-        # we are generating the sha1 hash for the model zip file
-        hash_val_model_file = self._generate_hash(model_path)
-
         if isVerbose:
             print("Total number of chunks", total_num_chunks)
-            print("Sha1 value of the model file: ", hash_val_model_file)
 
         model_meta_json_file = open(model_meta_path)
 
         model_meta_json: dict[str, Union[str, dict[str, str]]] = json.load(
             model_meta_json_file
         )
         model_meta_json[TOTAL_CHUNKS_FIELD] = total_num_chunks
-        model_meta_json[MODEL_CONTENT_SIZE_IN_BYTES_FIELD] = model_content_size_in_bytes
-        model_meta_json[MODEL_CONTENT_HASH_VALUE] = hash_val_model_file
+
+        if MODEL_CONTENT_SIZE_IN_BYTES_FIELD not in model_meta_json:
+            model_meta_json[
+                MODEL_CONTENT_SIZE_IN_BYTES_FIELD
+            ] = model_content_size_in_bytes
+        if MODEL_CONTENT_HASH_VALUE not in model_meta_json:
+            # Generate the sha256 hash for the model zip file
+            hash_val_model_file = _generate_model_content_hash_value(model_path)
+            model_meta_json[MODEL_CONTENT_HASH_VALUE] = hash_val_model_file
+            if isVerbose:
+                print("Sha1 value of the model file: ", hash_val_model_file)
+
         model_meta_json[MODEL_GROUP_ID] = model_group_id
 
         if self._check_mandatory_field(model_meta_json):
@@ -189,30 +194,3 @@ def _check_mandatory_field(self, model_meta: dict) -> bool:
             return True
         else:
             raise ValueError("Model metadata can't be empty")
-
-    def _generate_hash(self, model_file_path: str) -> str:
-        """
-        Generate sha1 hash value for the model zip file.
-
-        Parameters
-        ----------
-        :param model_file_path: file path of the model file
-        :type model_file_path: string
-
-
-        Returns
-        -------
-        :return: sha256 hash
-        :rtype: string
-
-        """
-
-        sha256 = hashlib.sha256()
-        with open(model_file_path, "rb") as file:
-            while True:
-                chunk = file.read(BUF_SIZE)
-                if not chunk:
-                    break
-                sha256.update(chunk)
-        sha256_value = sha256.hexdigest()
-        return sha256_value